Python Performance Optimization: Advanced Techniques and Best Practices
Python performance optimization is crucial for building efficient, scalable applications. This guide covers advanced techniques for profiling, memory management, concurrency, and caching that can significantly improve a Python application's performance.
Performance Profiling
1. CPU Profiling
cProfile for Function-Level Analysis
```python
import cProfile
import pstats
from functools import wraps

def profile_function(func):
    """Decorator to profile a function"""
    @wraps(func)
    def wrapper(*args, **kwargs):
        profiler = cProfile.Profile()
        profiler.enable()
        result = func(*args, **kwargs)
        profiler.disable()

        stats = pstats.Stats(profiler)
        stats.sort_stats('cumulative')
        stats.print_stats(10)  # Top 10 functions

        return result
    return wrapper

def compute_fibonacci(n):
    """Compute Fibonacci sequence (left undecorated: if the recursive
    calls went through the decorator, each would try to enable a second
    profiler while the first is still active)"""
    if n <= 1:
        return n
    return compute_fibonacci(n-1) + compute_fibonacci(n-2)

@profile_function
def run_fibonacci(n):
    return compute_fibonacci(n)

# Usage
result = run_fibonacci(30)
```

Line Profiling with line_profiler
```python
# Install: pip install line_profiler
from line_profiler import LineProfiler

def process_data(data):
    """Process large dataset"""
    result = []
    for item in data:
        processed = item * 2
        if processed > 100:
            result.append(processed)
    return result

# Profile specific lines
lp = LineProfiler()
lp_wrapper = lp(process_data)

data = list(range(1000))
lp_wrapper(data)

lp.print_stats()
```

Memory Profiling with memory_profiler
```python
# Install: pip install memory_profiler
from memory_profiler import profile

@profile
def memory_intensive_operation():
    """Function that uses significant memory"""
    large_list = [i for i in range(1000000)]
    processed = [x * 2 for x in large_list]
    filtered = [x for x in processed if x % 3 == 0]
    return len(filtered)

# Usage
result = memory_intensive_operation()
```

2. Advanced Profiling Tools
Py-Spy for Production Profiling
```shell
# Install: pip install py-spy

# Live top-style view of a running Python process
py-spy top --pid 12345

# Record a flame graph
py-spy record --pid 12345 --output profile.svg

# Dump the current call stack of every thread
py-spy dump --pid 12345
```

Scalene for Comprehensive Profiling
```python
# Install: pip install scalene
# Run the script under Scalene: scalene my_script.py
from scalene import scalene_profiler

def optimized_function():
    """Function profiled by Scalene"""
    data = list(range(100000))
    result = sum(x * x for x in data)
    return result

# Toggle profiling programmatically (only takes effect when the
# script is launched via the scalene command)
scalene_profiler.start()
result = optimized_function()
scalene_profiler.stop()
```

Memory Optimization
1. Memory-Efficient Data Structures
Using Generators Instead of Lists
```python
# Memory inefficient
def get_squares_list(n):
    """Returns list of squares - memory intensive"""
    return [i * i for i in range(n)]

# Memory efficient
def get_squares_generator(n):
    """Returns generator of squares - memory efficient"""
    for i in range(n):
        yield i * i

# Usage comparison
import sys

# List approach (high memory)
squares_list = get_squares_list(1000000)
print(f"List memory: {sys.getsizeof(squares_list)} bytes")

# Generator approach (low memory, constant size regardless of n)
squares_gen = get_squares_generator(1000000)
print(f"Generator memory: {sys.getsizeof(squares_gen)} bytes")
```

Using Array Module for Numeric Data
```python
import array
import sys

# Regular list (high memory)
regular_list = [1, 2, 3, 4, 5]
print(f"List memory: {sys.getsizeof(regular_list)} bytes")

# Array module (low memory)
numeric_array = array.array('i', [1, 2, 3, 4, 5])
print(f"Array memory: {sys.getsizeof(numeric_array)} bytes")

# For large datasets
large_array = array.array('d', range(1000000))  # Double precision
print(f"Large array memory: {sys.getsizeof(large_array)} bytes")
```

Using NumPy for Scientific Computing
```python
import sys
import time

import numpy as np

# Python list
python_list = list(range(1000000))
print(f"Python list memory: {sys.getsizeof(python_list)} bytes")

# NumPy array
numpy_array = np.arange(1000000, dtype=np.int32)
print(f"NumPy array memory: {sys.getsizeof(numpy_array)} bytes")

# Vectorized operations (much faster)
def sum_with_python():
    return sum(x * x for x in python_list)

def sum_with_numpy():
    return np.sum(numpy_array * numpy_array)

start = time.time()
result1 = sum_with_python()
python_time = time.time() - start

start = time.time()
result2 = sum_with_numpy()
numpy_time = time.time() - start

print(f"Python time: {python_time:.4f}s")
print(f"NumPy time: {numpy_time:.4f}s")
print(f"Speedup: {python_time/numpy_time:.2f}x")
```

2. Memory Management Techniques
Context Managers for Resource Management
```python
class DatabaseConnection:
    """Database connection with proper resource management"""

    def __init__(self, connection_string):
        self.connection_string = connection_string
        self.connection = None

    def __enter__(self):
        # Establish connection
        self.connection = self._connect()
        return self.connection

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Clean up connection even if the block raised
        if self.connection:
            self.connection.close()
            self.connection = None

    def _connect(self):
        # Simulate a connection object; a real driver would return one
        class FakeConnection:
            def __init__(self, target):
                self.target = target
            def close(self):
                print(f"Closed connection to {self.target}")
        return FakeConnection(self.connection_string)

# Usage
with DatabaseConnection("postgresql://localhost/db") as conn:
    # Use connection
    print(f"Using connection to {conn.target}")
# Connection automatically closed on exit
```

Weak References for Caching
```python
import time
import weakref
from functools import lru_cache

class ProcessedData:
    """Result wrapper - needed because plain str/int objects
    cannot be weakly referenced"""
    def __init__(self, value):
        self.value = value

class DataProcessor:
    """Class with weak reference caching"""

    def __init__(self):
        self._cache = weakref.WeakValueDictionary()

    def process_data(self, data_id):
        """Process data with caching"""
        cached = self._cache.get(data_id)
        if cached is not None:
            return cached

        # Simulate expensive processing
        result = ProcessedData(f"Processed data for {data_id}")

        # Cache with weak reference; the entry disappears automatically
        # once no strong references to the result remain
        self._cache[data_id] = result
        return result

# LRU cache for function memoization
@lru_cache(maxsize=128)
def expensive_computation(x, y):
    """Expensive computation with LRU caching"""
    time.sleep(0.1)  # Simulate work
    return x * y + x + y

# Usage
processor = DataProcessor()
result1 = processor.process_data("data_1")
result2 = processor.process_data("data_1")  # From cache while result1 is alive
```

Concurrency and Parallelism
1. Threading for I/O-Bound Tasks
Thread Pool Executor
```python
import concurrent.futures
import time

import requests

def fetch_url(url):
    """Fetch URL content"""
    try:
        response = requests.get(url, timeout=5)
        return {"url": url, "status": response.status_code, "length": len(response.text)}
    except Exception as e:
        return {"url": url, "error": str(e)}

def fetch_multiple_urls(urls):
    """Fetch multiple URLs concurrently"""
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Submit all tasks
        future_to_url = {executor.submit(fetch_url, url): url for url in urls}

        results = []
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as e:
                results.append({"url": url, "error": str(e)})

    return results

# Usage
urls = [
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/2",
    "https://httpbin.org/delay/1",
]

start = time.time()
results = fetch_multiple_urls(urls)
print(f"Time taken: {time.time() - start:.2f}s")
```

Asyncio for High-Concurrency I/O
```python
import asyncio
import time

import aiohttp

async def fetch_url_async(session, url):
    """Fetch URL asynchronously"""
    try:
        timeout = aiohttp.ClientTimeout(total=5)
        async with session.get(url, timeout=timeout) as response:
            text = await response.text()
            return {"url": url, "status": response.status, "length": len(text)}
    except Exception as e:
        return {"url": url, "error": str(e)}

async def fetch_multiple_urls_async(urls):
    """Fetch multiple URLs asynchronously"""
    connector = aiohttp.TCPConnector(limit=100)
    async with aiohttp.ClientSession(connector=connector) as session:
        tasks = [fetch_url_async(session, url) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results

# Usage
async def main():
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
    ]

    start = time.time()
    results = await fetch_multiple_urls_async(urls)
    print(f"Time taken: {time.time() - start:.2f}s")

# Run async main
asyncio.run(main())
```

2. Multiprocessing for CPU-Bound Tasks
Process Pool Executor
```python
import concurrent.futures
import time

def compute_prime_factors(n):
    """Compute prime factors of a number"""
    factors = []
    d = 2
    while d * d <= n:
        while (n % d) == 0:
            factors.append(d)
            n //= d
        d += 1
    if n > 1:
        factors.append(n)
    return factors

def factorize_numbers(numbers):
    """Factorize multiple numbers in parallel"""
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(compute_prime_factors, numbers))
    return results

# The __main__ guard is required on platforms that spawn worker processes
if __name__ == "__main__":
    numbers = [1234567, 2345678, 3456789, 4567890]

    start = time.time()
    results = factorize_numbers(numbers)
    print(f"Parallel time: {time.time() - start:.4f}s")

    # Sequential comparison
    start = time.time()
    results_seq = [compute_prime_factors(n) for n in numbers]
    print(f"Sequential time: {time.time() - start:.4f}s")
```

Shared Memory for Multiprocessing
```python
import multiprocessing as mp
import time
from multiprocessing import shared_memory

import numpy as np

def process_chunk(shm_name, shape, dtype, start_idx, end_idx, result_queue):
    """Sum of squares over a chunk of the shared array"""
    shm = shared_memory.SharedMemory(name=shm_name)
    array = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    result_queue.put(float(np.sum(array[start_idx:end_idx] ** 2)))
    shm.close()

def parallel_array_processing():
    """Process a large array in parallel using true shared memory
    (a Manager proxy would copy data on every access instead)"""
    data = np.random.rand(10000000)
    shm = shared_memory.SharedMemory(create=True, size=data.nbytes)
    shared = np.ndarray(data.shape, dtype=data.dtype, buffer=shm.buf)
    shared[:] = data  # Copy once into the shared block

    # Split work among processes
    num_processes = mp.cpu_count()
    chunk_size = len(shared) // num_processes

    processes = []
    result_queue = mp.Queue()

    for i in range(num_processes):
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size if i < num_processes - 1 else len(shared)

        process = mp.Process(
            target=process_chunk,
            args=(shm.name, data.shape, data.dtype, start_idx, end_idx, result_queue)
        )
        processes.append(process)
        process.start()

    # Collect results before joining to avoid blocking on a full queue
    results = [result_queue.get() for _ in range(num_processes)]

    # Wait for processes to complete
    for process in processes:
        process.join()

    shm.close()
    shm.unlink()  # Free the shared block
    return sum(results)

# Usage
if __name__ == "__main__":
    start = time.time()
    result = parallel_array_processing()
    print(f"Parallel processing time: {time.time() - start:.4f}s")
```

Algorithm Optimization
1. Efficient Data Structures
Using Sets for Membership Testing
```python
import time

# List membership testing (O(n))
large_list = list(range(1000000))
start = time.time()
result = 999999 in large_list
list_time = time.time() - start

# Set membership testing (O(1) on average)
large_set = set(range(1000000))
start = time.time()
result = 999999 in large_set
set_time = time.time() - start

print(f"List time: {list_time:.6f}s")
print(f"Set time: {set_time:.6f}s")
print(f"Speedup: {list_time/set_time:.2f}x")
```

Using Deque for Efficient Queue Operations
```python
import time
from collections import deque

# List as queue (inefficient)
queue_list = []
start = time.time()
for i in range(100000):
    queue_list.append(i)   # O(1)
for i in range(100000):
    queue_list.pop(0)      # O(n) - inefficient!
list_time = time.time() - start

# Deque as queue (efficient)
queue_deque = deque()
start = time.time()
for i in range(100000):
    queue_deque.append(i)   # O(1)
for i in range(100000):
    queue_deque.popleft()   # O(1) - efficient!
deque_time = time.time() - start

print(f"List queue time: {list_time:.4f}s")
print(f"Deque queue time: {deque_time:.4f}s")
print(f"Speedup: {list_time/deque_time:.2f}x")
```

2. Algorithmic Improvements
Memoization for Dynamic Programming
```python
import time
from functools import lru_cache

# Recursive Fibonacci (exponential time)
def fibonacci_recursive(n):
    """Inefficient recursive Fibonacci"""
    if n <= 1:
        return n
    return fibonacci_recursive(n-1) + fibonacci_recursive(n-2)

# Memoized Fibonacci (linear time)
@lru_cache(maxsize=None)
def fibonacci_memoized(n):
    """Efficient memoized Fibonacci"""
    if n <= 1:
        return n
    return fibonacci_memoized(n-1) + fibonacci_memoized(n-2)

# Iterative Fibonacci (linear time, no recursion)
def fibonacci_iterative(n):
    """Efficient iterative Fibonacci"""
    if n <= 1:
        return n

    a, b = 0, 1
    for _ in range(2, n + 1):
        a, b = b, a + b
    return b

# Performance comparison
n = 35

start = time.time()
result1 = fibonacci_recursive(n)
recursive_time = time.time() - start

start = time.time()
result2 = fibonacci_memoized(n)
memoized_time = time.time() - start

start = time.time()
result3 = fibonacci_iterative(n)
iterative_time = time.time() - start

print(f"Recursive time: {recursive_time:.4f}s")
print(f"Memoized time: {memoized_time:.6f}s")
print(f"Iterative time: {iterative_time:.6f}s")
```

Binary Search for Sorted Data
```python
import bisect
import random
import time

# Linear search (O(n))
def linear_search(arr, target):
    """Linear search implementation"""
    for i, val in enumerate(arr):
        if val == target:
            return i
    return -1

# Binary search using bisect (O(log n))
def binary_search(arr, target):
    """Binary search using bisect module"""
    index = bisect.bisect_left(arr, target)
    if index < len(arr) and arr[index] == target:
        return index
    return -1

# Performance comparison
sorted_data = sorted(random.sample(range(10000000), 1000000))
target = sorted_data[500000]  # Middle element

start = time.time()
result1 = linear_search(sorted_data, target)
linear_time = time.time() - start

start = time.time()
result2 = binary_search(sorted_data, target)
binary_time = time.time() - start

print(f"Linear search time: {linear_time:.6f}s")
print(f"Binary search time: {binary_time:.6f}s")
print(f"Speedup: {linear_time/binary_time:.2f}x")
```

Caching Strategies
1. Function Result Caching
Custom Cache Implementation
```python
import time
from functools import wraps

class TimedCache:
    """Cache with time expiration"""

    def __init__(self, timeout=300):
        self.timeout = timeout
        self.cache = {}

    def __call__(self, func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            key = str(args) + str(sorted(kwargs.items()))
            current_time = time.time()

            # Check if cached result exists and is not expired
            if key in self.cache:
                result, timestamp = self.cache[key]
                if current_time - timestamp < self.timeout:
                    return result

            # Compute and cache result
            result = func(*args, **kwargs)
            self.cache[key] = (result, current_time)
            return result

        wrapper.cache_clear = self.cache.clear
        return wrapper

@TimedCache(timeout=60)
def expensive_computation(x, y):
    """Expensive computation with timed cache"""
    time.sleep(1)  # Simulate work
    return x * y + x + y

# Usage
print(expensive_computation(5, 3))  # Takes 1 second
print(expensive_computation(5, 3))  # Instant (from cache)
```

Redis for Distributed Caching
```python
import hashlib
import json
import pickle
import time
from functools import wraps

import redis

class RedisCache:
    """Redis-based distributed cache"""

    def __init__(self, host='localhost', port=6379, db=0):
        self.redis_client = redis.Redis(host=host, port=port, db=db)

    def _generate_key(self, func_name, args, kwargs):
        """Generate cache key from function and arguments"""
        key_data = {
            'func': func_name,
            'args': args,
            'kwargs': sorted(kwargs.items())
        }
        key_str = json.dumps(key_data, sort_keys=True)
        return hashlib.md5(key_str.encode()).hexdigest()

    def get(self, key):
        """Get value from cache"""
        try:
            value = self.redis_client.get(key)
            if value:
                return pickle.loads(value)
        except Exception:
            pass
        return None

    def set(self, key, value, timeout=3600):
        """Set value in cache with expiration"""
        try:
            serialized = pickle.dumps(value)
            self.redis_client.setex(key, timeout, serialized)
        except Exception:
            pass

    def cached(self, timeout=3600):
        """Decorator for caching function results"""
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                key = self._generate_key(func.__name__, args, kwargs)

                # Try to get from cache
                result = self.get(key)
                if result is not None:
                    return result

                # Compute and cache result
                result = func(*args, **kwargs)
                self.set(key, result, timeout)
                return result

            return wrapper
        return decorator

# Usage
redis_cache = RedisCache()

@redis_cache.cached(timeout=300)
def api_call(endpoint, params):
    """Simulate expensive API call"""
    time.sleep(2)  # Simulate network delay
    return {"endpoint": endpoint, "params": params, "data": "response"}

result1 = api_call("/users", {"page": 1})  # Takes 2 seconds
result2 = api_call("/users", {"page": 1})  # Instant (from cache)
```

Code Optimization Techniques
1. Built-in Function Optimization
Using Built-in Functions
```python
import time

# Custom implementation (slower)
def custom_sum(numbers):
    """Custom sum implementation"""
    total = 0
    for num in numbers:
        total += num
    return total

# Built-in sum (faster)
def builtin_sum(numbers):
    """Built-in sum implementation"""
    return sum(numbers)

# Performance comparison
numbers = list(range(1000000))

start = time.time()
result1 = custom_sum(numbers)
custom_time = time.time() - start

start = time.time()
result2 = builtin_sum(numbers)
builtin_time = time.time() - start

print(f"Custom sum time: {custom_time:.4f}s")
print(f"Built-in sum time: {builtin_time:.4f}s")
print(f"Speedup: {custom_time/builtin_time:.2f}x")
```

List Comprehensions vs Loops
```python
import time

# Traditional loop (slower)
def process_with_loop(data):
    """Process data with traditional loop"""
    result = []
    for item in data:
        if item % 2 == 0:
            result.append(item * 2)
    return result

# List comprehension (faster)
def process_with_comprehension(data):
    """Process data with list comprehension"""
    return [item * 2 for item in data if item % 2 == 0]

# Performance comparison
data = list(range(1000000))

start = time.time()
result1 = process_with_loop(data)
loop_time = time.time() - start

start = time.time()
result2 = process_with_comprehension(data)
comprehension_time = time.time() - start

print(f"Loop time: {loop_time:.4f}s")
print(f"Comprehension time: {comprehension_time:.4f}s")
print(f"Speedup: {loop_time/comprehension_time:.2f}x")
```

2. String Optimization
String Joining Techniques
```python
import time

# String concatenation with + (inefficient)
def concatenate_with_plus(strings):
    """Concatenate strings using + operator"""
    result = ""
    for s in strings:
        result += s
    return result

# String joining with join (efficient)
def concatenate_with_join(strings):
    """Concatenate strings using join method"""
    return "".join(strings)

# Performance comparison
strings = ["hello"] * 10000

start = time.time()
result1 = concatenate_with_plus(strings)
plus_time = time.time() - start

start = time.time()
result2 = concatenate_with_join(strings)
join_time = time.time() - start

print(f"Plus concatenation time: {plus_time:.4f}s")
print(f"Join concatenation time: {join_time:.4f}s")
print(f"Speedup: {plus_time/join_time:.2f}x")
```

String Formatting Methods
```python
import time

# Different string formatting methods
name = "Alice"
age = 30
city = "New York"

# % formatting (old style)
def format_old_style():
    return "Name: %s, Age: %d, City: %s" % (name, age, city)

# str.format() (middle style)
def format_str_format():
    return "Name: {}, Age: {}, City: {}".format(name, age, city)

# f-strings (new style - fastest)
def format_f_string():
    return f"Name: {name}, Age: {age}, City: {city}"

# Performance comparison
iterations = 1000000

start = time.time()
for _ in range(iterations):
    format_old_style()
old_style_time = time.time() - start

start = time.time()
for _ in range(iterations):
    format_str_format()
str_format_time = time.time() - start

start = time.time()
for _ in range(iterations):
    format_f_string()
f_string_time = time.time() - start

print(f"Old style time: {old_style_time:.4f}s")
print(f"str.format() time: {str_format_time:.4f}s")
print(f"f-string time: {f_string_time:.4f}s")
```

Advanced Optimization Techniques
1. Cython for Performance-Critical Code
Cython Implementation
```python
# fibonacci.pyx
def fibonacci_cython(int n):
    """Fast Fibonacci implementation in Cython"""
    # long long avoids C int overflow (holds values up to fib(92))
    cdef long long a, b, temp
    cdef int i
    if n <= 1:
        return n

    a, b = 0, 1
    for i in range(2, n + 1):
        temp = a + b
        a = b
        b = temp
    return b
```

```python
# setup.py for compilation
from setuptools import setup
from Cython.Build import cythonize

setup(ext_modules=cythonize("fibonacci.pyx"))
```

Python Wrapper and Usage
```python
import time

import pyximport
pyximport.install()

# Import Cython module
import fibonacci_cython

def fibonacci_python(n):
    """Python Fibonacci implementation"""
    if n <= 1:
        return n

    a, b = 0, 1
    for _ in range(2, n + 1):
        a, b = b, a + b
    return b

# Performance comparison. fib(92) is the largest value that fits in a
# 64-bit C integer, so repeat a moderate n instead of using a huge one.
n, repetitions = 90, 100000

start = time.time()
for _ in range(repetitions):
    result1 = fibonacci_python(n)
python_time = time.time() - start

start = time.time()
for _ in range(repetitions):
    result2 = fibonacci_cython.fibonacci_cython(n)
cython_time = time.time() - start

print(f"Python time: {python_time:.4f}s")
print(f"Cython time: {cython_time:.4f}s")
print(f"Speedup: {python_time/cython_time:.2f}x")
```

2. Numba for JIT Compilation
Numba Optimization
```python
import time

import numba
import numpy as np

# Regular Python function
def sum_array_python(arr):
    """Regular Python array sum"""
    total = 0
    for i in range(len(arr)):
        total += arr[i] * arr[i]
    return total

# Numba JIT compiled function
@numba.jit(nopython=True)
def sum_array_numba(arr):
    """Numba JIT compiled array sum"""
    total = 0
    for i in range(len(arr)):
        total += arr[i] * arr[i]
    return total

# Performance comparison
data = np.random.rand(1000000)

# First call includes compilation time
start = time.time()
result1 = sum_array_numba(data)
numba_first_time = time.time() - start

# Subsequent calls use the compiled version
start = time.time()
result2 = sum_array_numba(data)
numba_time = time.time() - start

start = time.time()
result3 = sum_array_python(data)
python_time = time.time() - start

print(f"Python time: {python_time:.4f}s")
print(f"Numba first call: {numba_first_time:.4f}s")
print(f"Numba compiled: {numba_time:.4f}s")
print(f"Speedup: {python_time/numba_time:.2f}x")
```

Performance Monitoring
1. Real-time Performance Monitoring
Custom Performance Monitor
```python
import threading
import time
from collections import deque

import psutil

class PerformanceMonitor:
    """Real-time performance monitoring"""

    def __init__(self, interval=1.0, history_size=60):
        self.interval = interval
        self.history_size = history_size
        self.cpu_history = deque(maxlen=history_size)
        self.memory_history = deque(maxlen=history_size)
        self.running = False
        self.thread = None

    def start_monitoring(self):
        """Start performance monitoring"""
        self.running = True
        self.thread = threading.Thread(target=self._monitor_loop)
        self.thread.daemon = True
        self.thread.start()

    def stop_monitoring(self):
        """Stop performance monitoring"""
        self.running = False
        if self.thread:
            self.thread.join()

    def _monitor_loop(self):
        """Monitoring loop"""
        while self.running:
            cpu_percent = psutil.cpu_percent()
            memory_percent = psutil.virtual_memory().percent

            self.cpu_history.append(cpu_percent)
            self.memory_history.append(memory_percent)

            time.sleep(self.interval)

    def get_stats(self):
        """Get performance statistics"""
        if not self.cpu_history:
            return {}

        return {
            'cpu_avg': sum(self.cpu_history) / len(self.cpu_history),
            'cpu_max': max(self.cpu_history),
            'cpu_min': min(self.cpu_history),
            'memory_avg': sum(self.memory_history) / len(self.memory_history),
            'memory_max': max(self.memory_history),
            'memory_min': min(self.memory_history),
        }

# Usage
monitor = PerformanceMonitor()
monitor.start_monitoring()

# Run some intensive computation
def intensive_task():
    total = 0
    for i in range(10000000):
        total += i * i
    return total

result = intensive_task()

# Get performance stats
stats = monitor.get_stats()
print(f"Performance stats: {stats}")

monitor.stop_monitoring()
```

2. Performance Benchmarking Framework
Benchmarking Decorator
```python
import statistics
import time
from functools import wraps

def benchmark(repetitions=10, warmup=1):
    """Benchmark decorator for performance testing"""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            times = []

            # Warmup runs
            for _ in range(warmup):
                func(*args, **kwargs)

            # Actual benchmark runs
            for _ in range(repetitions):
                start = time.perf_counter()
                result = func(*args, **kwargs)
                end = time.perf_counter()
                times.append(end - start)

            # Calculate statistics
            mean_time = statistics.mean(times)
            median_time = statistics.median(times)
            min_time = min(times)
            max_time = max(times)
            std_dev = statistics.stdev(times) if len(times) > 1 else 0

            print(f"Benchmark results for {func.__name__}:")
            print(f"  Mean: {mean_time:.6f}s")
            print(f"  Median: {median_time:.6f}s")
            print(f"  Min: {min_time:.6f}s")
            print(f"  Max: {max_time:.6f}s")
            print(f"  Std Dev: {std_dev:.6f}s")
            print(f"  Repetitions: {repetitions}")

            return result

        return wrapper
    return decorator

# Usage
@benchmark(repetitions=100, warmup=5)
def test_function():
    """Function to benchmark"""
    return sum(i * i for i in range(10000))

result = test_function()
```

Best Practices Summary
Performance Optimization Checklist
Profiling and Measurement
- Profile before optimizing
- Measure performance improvements
- Use appropriate profiling tools
- Establish performance benchmarks
- Monitor in production
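A quick way to establish a benchmark is the standard-library `timeit` module, which repeats a statement many times and so gives more stable numbers than one-off `time.time()` deltas; a minimal sketch (the two functions are illustrative):

```python
import timeit

def squares_loop(n):
    """Baseline implementation to benchmark"""
    result = []
    for i in range(n):
        result.append(i * i)
    return result

def squares_comprehension(n):
    """Candidate optimization"""
    return [i * i for i in range(n)]

# timeit runs each callable `number` times and returns total seconds
loop_time = timeit.timeit(lambda: squares_loop(10000), number=200)
comp_time = timeit.timeit(lambda: squares_comprehension(10000), number=200)

print(f"Loop: {loop_time:.4f}s  Comprehension: {comp_time:.4f}s")
```

Keeping both timings lets you verify that an "optimization" actually improves on the recorded baseline.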
Memory Optimization
- Use generators for large datasets
- Choose appropriate data structures
- Implement proper resource management
- Use weak references for caching
- Monitor memory usage
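For the last item, the standard-library `tracemalloc` module can monitor memory usage and attribute allocations to source lines without any third-party dependency; a minimal sketch:

```python
import tracemalloc

tracemalloc.start()

# Allocate something measurable
data = [i * 2 for i in range(100000)]

# Current and peak traced allocations, in bytes
current, peak = tracemalloc.get_traced_memory()
print(f"Current: {current / 1024:.1f} KiB, Peak: {peak / 1024:.1f} KiB")

# Top allocation sites grouped by source line
snapshot = tracemalloc.take_snapshot()
for stat in snapshot.statistics("lineno")[:3]:
    print(stat)

tracemalloc.stop()
```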
Concurrency and Parallelism
- Use threading for I/O-bound tasks
- Use multiprocessing for CPU-bound tasks
- Implement async/await for high-concurrency I/O
- Consider shared memory for multiprocessing
- Use appropriate synchronization primitives
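As a sketch of the last item, a plain `threading.Lock` is the most common synchronization primitive; it protects a shared counter against lost updates, since `+=` on a shared variable is not atomic:

```python
import threading

counter = 0
lock = threading.Lock()

def increment(n):
    """Safely increment the shared counter n times"""
    global counter
    for _ in range(n):
        with lock:  # Only one thread mutates the counter at a time
            counter += 1

threads = [threading.Thread(target=increment, args=(100000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

print(counter)  # 400000 with the lock; often less without it
```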
Algorithm and Data Structure Optimization
- Choose optimal algorithms (O(log n) vs O(n))
- Use appropriate data structures (sets for membership)
- Implement memoization for repeated computations
- Use built-in functions when possible
- Consider binary search for sorted data
Code-Level Optimization
- Use list comprehensions instead of loops
- Optimize string operations (join vs +)
- Use f-strings for string formatting
- Consider Cython for critical code paths
- Use Numba for numerical computations
Caching Strategies
- Implement function result caching
- Use Redis for distributed caching
- Consider time-based cache expiration
- Cache database query results
- Implement cache invalidation strategies
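One simple invalidation strategy is to namespace keys by entity and drop every key under a prefix when that entity changes; a minimal in-memory sketch (the `PrefixCache` class and its `invalidate_prefix` helper are illustrative, not a Redis API):

```python
class PrefixCache:
    """Dict-backed cache supporting prefix-based invalidation"""

    def __init__(self):
        self._store = {}

    def set(self, key, value):
        self._store[key] = value

    def get(self, key):
        return self._store.get(key)

    def invalidate_prefix(self, prefix):
        """Drop every entry whose key starts with prefix"""
        stale = [k for k in self._store if k.startswith(prefix)]
        for k in stale:
            del self._store[k]
        return len(stale)

cache = PrefixCache()
cache.set("user:1:profile", {"name": "Alice"})
cache.set("user:1:orders", [101, 102])
cache.set("user:2:profile", {"name": "Bob"})

# A write to user 1 invalidates all of user 1's cached entries
removed = cache.invalidate_prefix("user:1:")
print(removed)                       # 2
print(cache.get("user:2:profile"))   # {'name': 'Bob'}
```

The same idea maps onto Redis with key patterns, at the cost of a scan over matching keys.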
Conclusion
Python performance optimization requires a systematic approach combining profiling, algorithmic improvements, and implementation techniques. Key takeaways:
- Profile First: Always measure before optimizing
- Choose Right Tools: Use appropriate profiling and optimization tools
- Algorithm Matters: Better algorithms beat micro-optimizations
- Memory Awareness: Understand memory usage patterns
- Concurrency Strategy: Choose threading, multiprocessing, or async based on workload
- Caching Works: Implement smart caching strategies
- Monitor Continuously: Track performance in production
Remember that optimization is a trade-off between performance, readability, and maintainability. Focus on critical code paths and measure actual improvements rather than theoretical gains.