Numba¶
Numba JIT Compilation:
The compute_percentiles function is compiled with Numba for faster percentile calculations, and the bootstrap_sampling function is compiled with Numba to generate bootstrap indices efficiently.
Vectorized Sampling:
All bootstrap indices are generated up front with np.random.randint and applied to the data before the resampling loop.
Preallocated Arrays:
Arrays for bootstrap_weights, bootstrap_random_variable, and bootstrap_comparison are preallocated for better memory management.
DataFrame to NumPy Conversion:
The DataFrame columns used in the analysis are converted to a NumPy array (.values) for faster indexing during bootstrap sampling.
Taken together, these changes should significantly improve performance, especially for large datasets and a high number of bootstrap iterations. A minimal sketch of the vectorized-sampling idea follows, with the full implementation after it.
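Here is that sketch in plain NumPy (toy shapes and illustrative names only). Inside the Numba-compiled sampler below, the same indices are drawn with scalar np.random.randint calls, since Numba's nopython mode does not accept the size argument.

import numpy as np

n_rows, num_bootstrap = 1000, 5              # toy sizes
data = np.random.rand(n_rows, 3)             # stand-in for the DataFrame's numeric values

# Draw every bootstrap index in one vectorized call ...
indices = np.random.randint(0, n_rows, size=(num_bootstrap, n_rows))
# ... then fancy-index the array: one (n_rows, 3) resample per bootstrap draw.
samples = data[indices]                      # shape: (num_bootstrap, n_rows, 3)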
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import jit

@jit(nopython=True)
def compute_percentiles(data, alpha):
    """Compute lower and upper percentiles for each column."""
    # Numba's np.percentile does not support the axis argument, so loop over columns.
    num_features = data.shape[1]
    lower = np.empty(num_features)
    upper = np.empty(num_features)
    for j in range(num_features):
        lower[j] = np.percentile(data[:, j], alpha * 100 / 2)
        upper[j] = np.percentile(data[:, j], 100 - alpha * 100 / 2)
    return lower, upper

@jit(nopython=True)
def bootstrap_sampling(num_bootstrap, num_samples):
    """Generate all bootstrap resampling indices up front."""
    # Numba supports only the first two arguments of np.random.randint,
    # so the index matrix is filled with scalar draws.
    bootstrap_indices = np.empty((num_bootstrap, num_samples), dtype=np.int64)
    for i in range(num_bootstrap):
        for j in range(num_samples):
            bootstrap_indices[i, j] = np.random.randint(0, num_samples)
    return bootstrap_indices
def optimized_bootstrap_relative_weights(df, outcome, drivers, focal=None, num_bootstrap=10000, compare="No", alpha=0.05):
    # relativeImp, add_random_variable and compare_predictors are assumed to be
    # defined/imported elsewhere in this notebook; they operate on pandas objects,
    # so they stay outside the Numba-compiled helpers.
    def compute_rel_wts(sample):
        return relativeImp(sample, outcome, drivers)['rawRelaImpt'].values

    def random_variable(sample):
        return add_random_variable(sample, outcome, drivers)['rawRelaImpt'].values

    def compare_preds(sample):
        return compare_predictors(sample, outcome, drivers, focal)['weightDiff'].values

    # Convert the needed columns to a NumPy array for faster positional indexing
    # (assumes the outcome and driver columns are numeric, as the analysis requires)
    use_cols = [outcome] + drivers
    df_values = df[use_cols].values
    num_samples = len(df)

    # Preallocate arrays for bootstrap results
    num_drivers = len(drivers)
    bootstrap_weights = np.zeros((num_bootstrap, num_drivers))
    bootstrap_random_variable = np.zeros((num_bootstrap, num_drivers))
    bootstrap_comparison = np.zeros((num_bootstrap, num_drivers)) if compare == "Yes" else None

    # Generate all bootstrap indices with the Numba-compiled sampler
    bootstrap_indices = bootstrap_sampling(num_bootstrap, num_samples)

    for i in range(num_bootstrap):
        # Resample rows from the NumPy array, then restore column names for the helpers
        sample = pd.DataFrame(df_values[bootstrap_indices[i]], columns=use_cols)
        try:
            bootstrap_weights[i] = compute_rel_wts(sample)
            bootstrap_random_variable[i] = random_variable(sample)
            if compare == "Yes":
                bootstrap_comparison[i] = compare_preds(sample)
        except Exception as e:
            print(f"Error in bootstrap sample {i}: {e}")
    # Compute confidence intervals using the Numba-compiled helper
    ci_relative_weights_lower, ci_relative_weights_upper = compute_percentiles(bootstrap_weights, alpha)
    ci_random_variable_lower, ci_random_variable_upper = compute_percentiles(bootstrap_random_variable, alpha)

    result = {
        'relative_weights': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_relative_weights_lower,
            'ci_upper': ci_relative_weights_upper,
            'ci_median': np.median(bootstrap_weights, axis=0),
        },
        'random_variable_diff': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_random_variable_lower,
            'ci_upper': ci_random_variable_upper,
            'ci_median': np.median(bootstrap_random_variable, axis=0),
        }
    }

    if compare == "Yes":
        ci_comparison_lower, ci_comparison_upper = compute_percentiles(bootstrap_comparison, alpha)
        comparisons = [f"{d}-{focal}" for d in drivers if d != focal]
        result['comparison_diff'] = {
            'comparison': comparisons,
            'ci_lower': ci_comparison_lower,
            'ci_upper': ci_comparison_upper,
            'ci_median': np.median(bootstrap_comparison, axis=0),
        }

    # Plot histograms
    plt.figure(figsize=(10, 4 * num_drivers))
    for i, driver in enumerate(drivers):
        weights = bootstrap_weights[:, i]
        median = np.median(weights)
        plt.subplot(num_drivers, 1, i + 1)
        plt.hist(weights, bins=50, alpha=0.5, color='blue', label='Relative Weights')
        plt.axvline(ci_relative_weights_lower[i], color='red', linestyle='--', label='Lower Bound')
        plt.axvline(ci_relative_weights_upper[i], color='green', linestyle='--', label='Upper Bound')
        plt.axvline(median, color='purple', linestyle='-', label='Median')
        plt.title(f"Distribution of Relative Weights for {driver}")
        plt.ylabel('Frequency')
        plt.legend()
    plt.show()
    return result
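A hypothetical call, assuming relativeImp, add_random_variable, and compare_predictors are available as in the rest of this notebook; survey_df and its column names are placeholders:

res = optimized_bootstrap_relative_weights(
    survey_df,                                  # placeholder DataFrame with numeric columns
    outcome="overall_sat",
    drivers=["price", "quality", "service"],
    focal="price",
    num_bootstrap=2000,
    compare="Yes",
    alpha=0.05,
)
print(res["relative_weights"]["ci_median"])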
Parallel Processing with Numba¶
import numpy as np
import matplotlib.pyplot as plt
from numba import jit, prange

@jit(nopython=True)
def compute_percentiles(data, alpha):
    """Compute lower and upper percentiles manually, column by column."""
    num_samples, num_features = data.shape
    lower = np.zeros(num_features)
    upper = np.zeros(num_features)
    for j in range(num_features):
        sorted_column = np.sort(data[:, j])  # sort each column manually
        lower_idx = int(alpha * 0.5 * num_samples)
        # Clamp so the upper index never falls outside the sorted array
        upper_idx = min(int((1 - alpha * 0.5) * num_samples), num_samples - 1)
        lower[j] = sorted_column[lower_idx]
        upper[j] = sorted_column[upper_idx]
    return lower, upper

@jit(nopython=True, parallel=True)
def bootstrap_sampling(data, num_bootstrap):
    """Bootstrap the column means in parallel.

    Note: this simplified variant bootstraps per-column means as a stand-in for
    the full relative-weights computation, which cannot run in nopython mode.
    """
    num_samples, num_features = data.shape
    bootstrap_weights = np.zeros((num_bootstrap, num_features))
    for i in prange(num_bootstrap):
        # Numba supports only the first two arguments of np.random.randint,
        # so the resampling indices are drawn one at a time.
        indices = np.empty(num_samples, dtype=np.int64)
        for k in range(num_samples):
            indices[k] = np.random.randint(0, num_samples)
        sample = data[indices]
        # Compute the mean of each column of the resample
        for j in range(num_features):
            bootstrap_weights[i, j] = sample[:, j].sum() / num_samples
    return bootstrap_weights
def optimized_bootstrap_relative_weights(df, outcome, drivers, num_bootstrap=10000, alpha=0.05):
    # Convert DataFrame to NumPy array (ensure uniform data type)
    data = df[drivers].to_numpy(dtype=np.float64)

    # Perform bootstrap sampling using Numba
    bootstrap_weights = bootstrap_sampling(data, num_bootstrap)

    # Compute confidence intervals
    ci_lower, ci_upper = compute_percentiles(bootstrap_weights, alpha)
    ci_median = np.median(bootstrap_weights, axis=0)

    # Prepare result
    result = {
        'relative_weights': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'ci_median': ci_median,
        }
    }

    # Plot histograms
    num_drivers = len(drivers)
    plt.figure(figsize=(10, 4 * num_drivers))
    for i, driver in enumerate(drivers):
        weights = bootstrap_weights[:, i]
        median = ci_median[i]
        plt.subplot(num_drivers, 1, i + 1)
        plt.hist(weights, bins=50, alpha=0.5, color='blue', label='Relative Weights')
        plt.axvline(ci_lower[i], color='red', linestyle='--', label='Lower Bound')
        plt.axvline(ci_upper[i], color='green', linestyle='--', label='Upper Bound')
        plt.axvline(median, color='purple', linestyle='-', label='Median')
        plt.title(f"Distribution of Relative Weights for {driver}")
        plt.ylabel('Frequency')
        plt.legend()
    plt.show()
    return result
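A quick self-contained check on synthetic data. Remember that this variant bootstraps column means as a stand-in for the full relative-weights computation, and that the first call pays Numba's compilation cost:

import pandas as pd

rng = np.random.default_rng(0)
demo_df = pd.DataFrame(rng.normal(size=(500, 3)), columns=["x1", "x2", "x3"])
demo_df["y"] = demo_df[["x1", "x2", "x3"]].sum(axis=1) + rng.normal(size=500)

out = optimized_bootstrap_relative_weights(demo_df, "y", ["x1", "x2", "x3"],
                                           num_bootstrap=1000, alpha=0.05)
print(out["relative_weights"]["ci_median"])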
Parallel Processing with ProcessPoolExecutor¶
Parallel Processing:
The ProcessPoolExecutor distributes the bootstrap sampling across multiple CPU cores; each bootstrap iteration is handled by the bootstrap_worker function, which processes a single resample.
Efficient Data Handling:
Results from each worker are collected and converted into NumPy arrays for further processing.
Error Handling:
Errors in individual bootstrap samples are logged, and invalid results are skipped.
Confidence Interval Calculation:
Percentiles are calculated on the aggregated results after all workers complete their tasks. This approach uses parallelism to significantly reduce the runtime for large datasets and a high number of bootstrap iterations. The full implementation is below, after a minimal sketch of the submit-and-collect pattern.
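The general submit-and-collect pattern, as a minimal sketch (the hypothetical one_sample stands in for any expensive per-resample computation; the __main__ guard matters on platforms that spawn worker processes):

from concurrent.futures import ProcessPoolExecutor

def one_sample(seed):
    # placeholder for a single bootstrap iteration
    return seed * 2.0

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(one_sample, s) for s in range(8)]
        results = [f.result() for f in futures]   # collected in submission order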
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor

# relativeImp, add_random_variable and compare_predictors are assumed to be
# defined/imported elsewhere in this notebook.
def compute_rel_wts(sample, outcome, drivers):
    return relativeImp(sample, outcome, drivers)['rawRelaImpt'].values

def random_variable(sample, outcome, drivers):
    return add_random_variable(sample, outcome, drivers)['rawRelaImpt'].values

def compare_preds(sample, outcome, drivers, focal):
    return compare_predictors(sample, outcome, drivers, focal)['weightDiff'].values

def bootstrap_worker(df, outcome, drivers, focal, compare):
    """Worker function to process a single bootstrap sample."""
    indices = np.random.choice(df.index, size=len(df), replace=True)
    sample = df.loc[indices]
    try:
        relative_weights = compute_rel_wts(sample, outcome, drivers)
        rand_weights = random_variable(sample, outcome, drivers)
        comp_weights = compare_preds(sample, outcome, drivers, focal) if compare == "Yes" else None
        return relative_weights, rand_weights, comp_weights
    except Exception as e:
        print(f"Error in bootstrap sample: {e}")
        return None, None, None
def parallel_bootstrap_relative_weights(df, outcome, drivers, focal=None, num_bootstrap=10000, compare="No", alpha=0.05):
    num_drivers = len(drivers)

    # Collect bootstrap results from the workers
    bootstrap_weights = []
    bootstrap_random_variable = []
    bootstrap_comparison = [] if compare == "Yes" else None

    # Use ProcessPoolExecutor for parallel processing
    with ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(bootstrap_worker, df, outcome, drivers, focal, compare)
            for _ in range(num_bootstrap)
        ]
        for future in futures:
            relative_weights, rand_weights, comp_weights = future.result()
            if relative_weights is not None:
                bootstrap_weights.append(relative_weights)
                bootstrap_random_variable.append(rand_weights)
                if compare == "Yes":
                    bootstrap_comparison.append(comp_weights)

    # Convert results to NumPy arrays
    bootstrap_weights = np.array(bootstrap_weights)
    bootstrap_random_variable = np.array(bootstrap_random_variable)
    if compare == "Yes":
        bootstrap_comparison = np.array(bootstrap_comparison)

    # Compute confidence intervals
    ci_relative_weights = np.percentile(bootstrap_weights, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)
    ci_random_variable = np.percentile(bootstrap_random_variable, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)

    result = {
        'relative_weights': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_relative_weights[0],
            'ci_upper': ci_relative_weights[1],
            'ci_median': np.median(bootstrap_weights, axis=0),
        },
        'random_variable_diff': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_random_variable[0],
            'ci_upper': ci_random_variable[1],
            'ci_median': np.median(bootstrap_random_variable, axis=0),
        }
    }

    if compare == "Yes":
        ci_comparison = np.percentile(bootstrap_comparison, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)
        comparisons = [f"{d}-{focal}" for d in drivers if d != focal]
        result['comparison_diff'] = {
            'comparison': comparisons,
            'ci_lower': ci_comparison[0],
            'ci_upper': ci_comparison[1],
            'ci_median': np.median(bootstrap_comparison, axis=0),
        }

    # Plot histograms
    plt.figure(figsize=(10, 4 * num_drivers))
    for i, driver in enumerate(drivers):
        weights = bootstrap_weights[:, i]
        median = np.median(weights)
        plt.subplot(num_drivers, 1, i + 1)
        plt.hist(weights, bins=50, alpha=0.5, color='blue', label='Relative Weights')
        plt.axvline(ci_relative_weights[0][i], color='red', linestyle='--', label='Lower Bound')
        plt.axvline(ci_relative_weights[1][i], color='green', linestyle='--', label='Upper Bound')
        plt.axvline(median, color='purple', linestyle='-', label='Median')
        plt.title(f"Distribution of Relative Weights for {driver}")
        plt.ylabel('Frequency')
        plt.legend()
    plt.show()
    return result
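A hypothetical invocation; survey_df and its column names are placeholders. On platforms that spawn worker processes (Windows, recent macOS), the call must sit under an if __name__ == "__main__": guard so the pool does not re-import the module recursively:

if __name__ == "__main__":
    res = parallel_bootstrap_relative_weights(
        survey_df,                              # placeholder DataFrame with numeric columns
        outcome="overall_sat",
        drivers=["price", "quality", "service"],
        focal="price",
        num_bootstrap=2000,
        compare="Yes",
    )
    print(res["relative_weights"]["ci_median"])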
New (multiprocessing Pool)¶
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing import Pool

# These helpers call pandas-based functions (relativeImp, add_random_variable,
# compare_predictors, assumed to be defined elsewhere in this notebook), so they
# cannot be compiled with Numba's nopython mode and are left as plain Python.
def compute_rel_wts(sample, outcome, drivers):
    return relativeImp(sample, outcome, drivers)['rawRelaImpt'].values

def random_variable(sample, outcome, drivers):
    return add_random_variable(sample, outcome, drivers)['rawRelaImpt'].values

def compare_preds(sample, outcome, drivers, focal):
    return compare_predictors(sample, outcome, drivers, focal)['weightDiff'].values

def bootstrap_sample(df, outcome, drivers, focal, compare):
    """Process a single bootstrap resample."""
    indices = np.random.choice(df.index, size=len(df), replace=True)
    sample = df.loc[indices]
    try:
        relative_weights = compute_rel_wts(sample, outcome, drivers)
        rand_weights = random_variable(sample, outcome, drivers)
        comp_weights = compare_preds(sample, outcome, drivers, focal) if compare == "Yes" else None
        return relative_weights, rand_weights, comp_weights
    except Exception as e:
        print(f"Error in bootstrap sample: {e}")
        return None, None, None
def bootstrap_relative_weights(df, outcome, drivers, focal=None, num_bootstrap=10000, compare="No", alpha=0.05):
    # Note: when using multiprocessing, call this function from under an
    # `if __name__ == "__main__":` guard on platforms that spawn processes.
    with Pool() as pool:
        results = pool.starmap(bootstrap_sample, [(df, outcome, drivers, focal, compare) for _ in range(num_bootstrap)])

    bootstrap_weights = [res[0] for res in results if res[0] is not None]
    bootstrap_random_variable = [res[1] for res in results if res[1] is not None]
    bootstrap_comparison = [res[2] for res in results if res[2] is not None]

    ci_relative_weights = np.percentile(bootstrap_weights, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)
    ci_random_variable = np.percentile(bootstrap_random_variable, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)

    result = {
        'relative_weights': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_relative_weights[0],
            'ci_upper': ci_relative_weights[1],
            'ci_median': np.median(bootstrap_weights, axis=0),
        },
        'random_variable_diff': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_random_variable[0],
            'ci_upper': ci_random_variable[1],
            'ci_median': np.median(bootstrap_random_variable, axis=0)
        }
    }

    if compare == "Yes":
        ci_comparison = np.percentile(bootstrap_comparison, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)
        comparisons = [f"{d}-{focal}" for d in drivers if d != focal]
        result['comparison_diff'] = {
            'comparison': comparisons,
            'ci_lower': ci_comparison[0],
            'ci_upper': ci_comparison[1],
            'ci_median': np.median(bootstrap_comparison, axis=0)
        }

    num_drivers = len(drivers)
    plt.figure(figsize=(10, 4 * num_drivers))
    for i, driver in enumerate(drivers):
        weights = [rw[i] for rw in bootstrap_weights]
        median = np.median(weights)
        plt.subplot(num_drivers, 1, i + 1)
        plt.hist(weights, bins=50, alpha=0.5, color='blue', label='Relative Weights')
        plt.axvline(ci_relative_weights[0][i], color='red', linestyle='--', label='Lower Bound')
        plt.axvline(ci_relative_weights[1][i], color='green', linestyle='--', label='Upper Bound')
        plt.axvline(median, color='purple', linestyle='-', label='Median')
        plt.title(f"Distribution of Relative Weights for {driver}")
        plt.ylabel('Frequency')
        plt.legend()
    plt.show()
    return result
Numba New¶
from numba import njit, prange
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

@njit
def compute_rel_wts_numba(corrXX, corrXY):
    """Johnson's relative weights from the predictor correlation matrix and
    the predictor-outcome correlations."""
    # corrXX is symmetric, so eigh is used (it stays in the real domain).
    w_corrXX, v_corrXX = np.linalg.eigh(corrXX)
    diag = np.diag(np.sqrt(w_corrXX))
    coef_xz = v_corrXX @ diag @ v_corrXX.T          # Lambda: predictors -> orthogonal Z
    coef_yz = np.linalg.inv(coef_xz) @ corrXY       # standardized betas of Z on the outcome
    rsquare = np.sum(coef_yz**2)
    rawWeights = (coef_xz**2) @ (coef_yz**2)
    normWeights = (rawWeights / rsquare) * 100      # percentage of R^2 (not returned here)
    return rawWeights

@njit(parallel=True)
def bootstrap_loop(df_values, num_bootstrap, outcome_idx, driver_indices):
    n_samples = df_values.shape[0]
    n_drivers = len(driver_indices)
    bootstrap_weights = np.zeros((num_bootstrap, n_drivers))
    for b in prange(num_bootstrap):
        # Resample rows with replacement (scalar np.random.randint is the
        # form Numba supports in nopython mode).
        indices = np.empty(n_samples, dtype=np.int64)
        for k in range(n_samples):
            indices[k] = np.random.randint(0, n_samples)
        sample = df_values[indices]
        corrALL = np.corrcoef(sample.T)
        # Numba's linear-algebra routines need contiguous arrays, so copy the slices.
        corrXX = np.ascontiguousarray(corrALL[1:, 1:])
        corrXY = np.ascontiguousarray(corrALL[1:, 0])
        bootstrap_weights[b] = compute_rel_wts_numba(corrXX, corrXY)
    return bootstrap_weights
def bootstrap_relative_weights(df, outcome, drivers, num_bootstrap=10000, alpha=0.05):
    # Prepare data: coerce everything to numeric, drop incomplete rows,
    # and ensure a uniform float dtype for Numba
    all_columns = [outcome] + drivers
    df_numeric = df[all_columns].apply(pd.to_numeric, errors='coerce').dropna()
    df_values = df_numeric.to_numpy(dtype=np.float64)
    outcome_idx = 0
    driver_indices = np.arange(1, len(all_columns), dtype=np.int64)  # typed array for Numba

    # Run bootstrap loop
    bootstrap_weights = bootstrap_loop(df_values, num_bootstrap, outcome_idx, driver_indices)

    # Compute confidence intervals
    ci_relative_weights = np.percentile(bootstrap_weights, [alpha * 100 / 2, 100 - alpha * 100 / 2], axis=0)

    result = {
        'relative_weights': {
            'Outcome': outcome,
            'Drivers': drivers,
            'ci_lower': ci_relative_weights[0],
            'ci_upper': ci_relative_weights[1],
            'ci_median': np.median(bootstrap_weights, axis=0),
        }
    }

    # Plot results
    num_drivers = len(drivers)
    plt.figure(figsize=(10, 4 * num_drivers))
    for i, driver in enumerate(drivers):
        weights = bootstrap_weights[:, i]
        median = np.median(weights)
        plt.subplot(num_drivers, 1, i + 1)
        plt.hist(weights, bins=50, alpha=0.5, color='blue', label='Relative Weights')
        plt.axvline(ci_relative_weights[0][i], color='red', linestyle='--', label='Lower Bound')
        plt.axvline(ci_relative_weights[1][i], color='green', linestyle='--', label='Upper Bound')
        plt.axvline(median, color='purple', linestyle='-', label='Median')
        plt.title(f"Distribution of Relative Weights for {driver}")
        plt.ylabel('Frequency')
        plt.legend()
    plt.show()
    return result
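A self-contained smoke test on synthetic data, assuming a reasonably recent NumPy/Numba (the first call also triggers JIT compilation, so use a smaller num_bootstrap while experimenting):

rng = np.random.default_rng(42)
X = rng.normal(size=(300, 3))
y = X @ np.array([0.5, 0.3, 0.2]) + rng.normal(scale=0.5, size=300)
demo_df = pd.DataFrame(np.column_stack([y, X]), columns=["y", "x1", "x2", "x3"])

out = bootstrap_relative_weights(demo_df, "y", ["x1", "x2", "x3"], num_bootstrap=2000)
print(out["relative_weights"]["ci_median"])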