# functions.jl
using Statistics: quantile, mean, std, var, cov
using Clustering: kmeans, kmedoids
using Distributions: Normal, cdf
using LinearAlgebra: eigen, Symmetric
using Distances: pairwise, Euclidean
using StatsBase: mode
using Strategies: asset_bysym
using Strategies: DateTime
using Strategies.Lang: @lget!
using StatsBase: StatsBase

# NOTE(review): `DataFrame`, `select!`, `metadata` and `DFT` are used below but are not
# imported in this file -- presumably the module that includes it provides them; confirm.

"""
    sort_col_byrowsum!(df)

Reorder the columns of `df` in place, ascending by each column's sum of absolute values.

Despite the name, the sort key is computed per column (the sum of `|x|` down the column),
not per row.

# Returns
- The result of `select!(df, order)`.
"""
function sort_col_byrowsum!(df)
    abs_sums = [sum(abs, col) for col in eachcol(df)]
    return select!(df, sortperm(abs_sums))
end

"""
    quantile_df(df, q)

Compute the `q`-quantile of every row of `df`.

# Arguments
- `df`: table of numeric columns. It is mutated: its columns are reordered by
  [`sort_col_byrowsum!`](@ref) before the quantiles are taken.
- `q`: quantile probability passed to `quantile`; each result must be convertible to
  `DFT`, so a single probability is expected.

# Returns
- A `Vector{DFT}` with one quantile per row of `df`.
"""
function quantile_df(df, q)
    # The column reordering does not influence a row quantile; it is kept because callers
    # may rely on `df` coming back reordered.
    sort_col_byrowsum!(df)
    return DFT[quantile(row, q) for row in eachrow(df)]
end

"""
    cluster_df(df, n=2)

Partition the columns of `df` into `n` groups with k-means.

# Arguments
- `df`: table whose columns are the items to cluster.
- `n`: number of clusters.

# Returns
- A vector of `n` vectors of column names, one per cluster. A cluster that received no
  column yields an empty vector.
"""
function cluster_df(df, n=2)
    # `kmeans` treats every matrix column as one observation, so there is exactly one
    # assignment per column of `df`.
    assignments = kmeans(Matrix(df), n).assignments
    colnames = names(df)
    return [colnames[assignments .== k] for k in 1:n]
end

"""
    find_lead_lag_pairs(corr_dict::Dict, threshold::Float64=0.7; max_lag::Int=3)

Identify lead-lag relationships between assets based on cross-correlation.

# Arguments
- `corr_dict`: dictionary mapping a lag to a correlation DataFrame. The column names of
  each frame are the `y` assets; the `x` asset labels are read from the frame's
  `"x_assets"` metadata and are assumed to label the rows, in order.
- `threshold`: minimum absolute correlation for a pair to be reported.
- `max_lag`: frames whose `abs(lag)` exceeds this value are ignored.

# Returns
- A DataFrame with columns `[:asset1, :asset2, :lag, :correlation]`, where `asset1` is
  the `x` asset and `asset2` the `y` asset, sorted by signed correlation in descending
  order.
"""
function find_lead_lag_pairs(corr_dict::Dict, threshold::Float64=0.7; max_lag::Int=3)
    results = DataFrame(asset1=String[], asset2=String[], lag=Int[], correlation=Float64[])

    for (lag, df) in corr_dict
        abs(lag) > max_lag && continue

        x_assets = metadata(df, "x_assets")
        y_assets = names(df)

        # `names(df)` are column labels, so the y index selects the column and the
        # x index selects the row.
        for (col, y_asset) in enumerate(y_assets)
            for (row, x_asset) in enumerate(x_assets)
                corr_val = df[row, col]
                # A missing entry carries no evidence either way.
                ismissing(corr_val) && continue
                if abs(corr_val) >= threshold
                    push!(results, (x_asset, y_asset, lag, corr_val))
                end
            end
        end
    end

    return sort(results, :correlation, rev=true)
end

# Flatten every time slice of a p-by-q-by-n array into one row of an n-by-(p*q) matrix.
function _flatten_corr_matrices(corr_matrices::AbstractArray{<:Any,3})
    n = size(corr_matrices, 3)
    features = zeros(n, size(corr_matrices, 1) * size(corr_matrices, 2))
    for (t, k) in enumerate(axes(corr_matrices, 3))
        features[t, :] = vec(corr_matrices[:, :, k])
    end
    return features
end

# Flatten a vector of equally sized matrices into one feature row per matrix.
function _flatten_corr_matrices(corr_matrices::AbstractVector{<:AbstractMatrix})
    isempty(corr_matrices) && return zeros(0, 0)
    width = length(first(corr_matrices))
    features = zeros(length(corr_matrices), width)
    for (t, m) in enumerate(corr_matrices)
        length(m) == width ||
            throw(DimensionMismatch("all correlation matrices must have the same size"))
        features[t, :] = vec(m)
    end
    return features
end

# Reject every other array shape with a clear message.
function _flatten_corr_matrices(corr_matrices::AbstractArray)
    throw(
        ArgumentError(
            "expected a 3-D array or a vector of matrices, got $(typeof(corr_matrices))"
        ),
    )
end

"""
    detect_correlation_regime(corr_matrices::AbstractArray, window::Int=20; n_regimes::Int=2)

Detect market regimes based on changes in correlation structure.

Every correlation matrix is flattened into a feature vector, the time steps are clustered
with k-medoids on their pairwise Euclidean distances, and the labels are smoothed with a
trailing rolling mode.

# Arguments
- `corr_matrices`: time series of correlation matrices, either a 3-D array whose third
  dimension is time or a vector of equally sized matrices.
- `window`: length of the trailing window used to smooth the labels. It is clamped to
  the number of time steps.
- `n_regimes`: number of regimes to detect.

# Returns
- A `Vector{Int}` with one regime label per time step (empty for empty input). The first
  `window - 1` entries repeat the first smoothed label.

# Throws
- `ArgumentError` if `window < 1` or `corr_matrices` has an unsupported shape.
"""
function detect_correlation_regime(
    corr_matrices::AbstractArray, window::Int=20; n_regimes::Int=2
)
    window >= 1 || throw(ArgumentError("window must be at least 1, got $window"))

    features = _flatten_corr_matrices(corr_matrices)
    n = size(features, 1)
    n == 0 && return Int[]

    # Rows of `features` are the observations, so distances are taken along `dims=1`;
    # this yields the n-by-n matrix over time steps that k-medoids needs.
    dist = pairwise(Euclidean(), features; dims=1)
    assignments = kmedoids(dist, n_regimes).assignments

    # Smooth with the mode over a trailing window; a window longer than the series
    # degrades to a single window spanning all of it.
    w = min(window, n)
    smoothed_regimes = ones(Int, n)
    for i in w:n
        smoothed_regimes[i] = mode(assignments[(i - w + 1):i])
    end

    # Periods without a full trailing window inherit the first smoothed label.
    smoothed_regimes[1:(w - 1)] .= smoothed_regimes[w]

    return smoothed_regimes
end

"""
    find_cointegrated_prices(prices::Dict{String,Vector{Float64}}; pvalue_threshold::Float64=0.05)

Screen every unordered pair of assets for cointegration and estimate the half-life of
mean reversion of the price spread.

!!! warning
    The Engle-Granger test is not implemented yet. `coint_pvalue` is a hard-coded
    placeholder of `0.0`, so every pair passes any positive `pvalue_threshold`, and
    `adf_pvalue` is always reported as `0.0`.

# Arguments
- `prices`: dictionary of price series with asset names as keys. The two series of a
  pair must have the same length.
- `pvalue_threshold`: maximum p-value to consider a pair cointegrated.

# Returns
- A DataFrame with columns `[:asset1, :asset2, :coint_pvalue, :adf_pvalue, :half_life]`,
  sorted by `coint_pvalue`. `half_life` is `-log(2) / beta`, where `beta` is the slope of
  the spread change on the lagged spread; it is negative or infinite when `beta >= 0`.
"""
function find_cointegrated_prices(
    prices::Dict{String,Vector{Float64}}; pvalue_threshold::Float64=0.05
)
    # Sort the names so the pairing and row order do not depend on Dict hash order.
    assets = sort!(collect(keys(prices)))
    n = length(assets)
    results = DataFrame(
        asset1=String[],
        asset2=String[],
        coint_pvalue=Float64[],
        adf_pvalue=Float64[],
        half_life=Float64[],
    )

    for i in 1:(n - 1)
        for j in (i + 1):n
            asset1, asset2 = assets[i], assets[j]
            p1, p2 = prices[asset1], prices[asset2]

            # TODO: replace both placeholders with a real Engle-Granger / ADF test.
            coint_pvalue = 0.0
            adf_pvalue = 0.0

            if coint_pvalue < pvalue_threshold
                # Half-life of mean reversion from regressing the spread change on the
                # lagged spread.
                spread = p1 .- p2
                lagged = spread[1:(end - 1)]
                delta = diff(spread)
                beta = cov(delta, lagged) / var(lagged)
                half_life = -log(2) / beta

                push!(results, (asset1, asset2, coint_pvalue, adf_pvalue, half_life))
            end
        end
    end

    return sort(results, :coint_pvalue)
end

export sort_col_byrowsum!,
    quantile_df,
    cluster_df,
    find_lead_lag_pairs,
    detect_correlation_regime,
    find_cointegrated_prices