# FeatureSelection/src/functions.jl
  1  using Statistics: quantile, mean, std, var, cov
  2  using Clustering: kmeans, kmedoids
  3  using Distributions: Normal, cdf
  4  using LinearAlgebra: eigen, Symmetric
  5  using Distances: pairwise, Euclidean
  6  using StatsBase: mode
  7  using Strategies: asset_bysym
  8  using Strategies: DateTime
  9  using Strategies.Lang: @lget!
 10  using StatsBase: StatsBase
 11  
 12  function sort_col_byrowsum!(df)
 13      # first calculate row sum
 14      rowsums = sum.(abs.(v) for v in eachcol(df))
 15      # then sort by row sum
 16      indices = sortperm(rowsums)
 17      select!(df, indices)
 18  end
 19  
 20  # a function that quantiles a dataframe
 21  function quantile_df(df, q)
 22      sort_col_byrowsum!(df)
 23      ans = DFT[]
 24      for row in eachrow(df)
 25          push!(ans, quantile(row, q))
 26      end
 27      return ans
 28  end
 29  
 30  function cluster_df(df, n=2)
 31      idx = kmeans(Matrix(df), n).assignments
 32      colnames = names(df)
 33      groups = []
 34      for i in 1:n
 35          push!(groups, colnames[idx .== i])
 36      end
 37      return groups
 38  end
 39  
 40  """
 41      find_lead_lag_pairs(corr_dict::Dict, threshold::Float64=0.7; max_lag::Int=3)
 42  
 43  Identify lead-lag relationships between assets based on cross-correlation.
 44  
 45  # Arguments
 46  - `corr_dict`: Dictionary of correlation DataFrames with lags as keys
 47  - `threshold`: Minimum absolute correlation to consider a relationship
 48  - `max_lag`: Maximum lag to consider for lead-lag relationships
 49  
 50  # Returns
 51  - A DataFrame with columns [:asset1, :asset2, :lag, :correlation] showing significant lead-lag pairs
 52  """
 53  function find_lead_lag_pairs(corr_dict::Dict, threshold::Float64=0.7; max_lag::Int=3)
 54      pairs = DataFrame(asset1=String[], asset2=String[], lag=Int[], correlation=Float64[])
 55      
 56      for (lag, df) in corr_dict
 57          abs(lag) > max_lag && continue
 58          
 59          x_assets = metadata(df, "x_assets")
 60          y_assets = names(df)
 61          
 62          for (i, y_asset) in enumerate(y_assets)
 63              for (j, x_asset) in enumerate(x_assets)
 64                  corr_val = df[i, j]
 65                  if abs(corr_val) >= threshold
 66                      push!(pairs, (x_asset, y_asset, lag, corr_val))
 67                  end
 68              end
 69          end
 70      end
 71      
 72      return sort(pairs, :correlation, rev=true)
 73  end
 74  
 75  """
 76      detect_correlation_regime(corr_matrix::AbstractMatrix, window::Int=20; n_regimes::Int=2)
 77  
 78  Detect market regimes based on changes in correlation structure.
 79  
 80  # Arguments
 81  - `corr_matrix`: Time series of correlation matrices (3D array or vector of matrices)
 82  - `window`: Rolling window for regime detection
 83  - `n_regimes`: Number of regimes to detect
 84  
 85  # Returns
 86  - A vector of regime labels for each time period
 87  """
 88  function detect_correlation_regime(corr_matrices::AbstractArray, window::Int=20; n_regimes::Int=2)
 89      n = size(corr_matrices, 3)
 90      features = zeros(n, size(corr_matrices, 1) * size(corr_matrices, 2))
 91      
 92      # Flatten correlation matrices into feature vectors
 93      for i in 1:n
 94          features[i, :] = vec(corr_matrices[:, :, i])
 95      end
 96      
 97      # Use k-medoids for regime detection
 98      dist = pairwise(Euclidean(), features; dims=2)
 99      clusters = kmedoids(dist, n_regimes)
100  
101      # Smooth the regime labels with a rolling window
102      smoothed_regimes = ones(Int, n)
103      for i in window:size(dist, 1)
104          window_regimes = clusters.assignments[(i-window+1):i]
105          smoothed_regimes[i] = mode(window_regimes)
106      end
107      
108      # Fill the beginning with the first detected regime
109      smoothed_regimes[1:window-1] .= smoothed_regimes[window]
110      
111      return smoothed_regimes
112  end
113  
"""
    find_cointegrated_prices(prices; pvalue_threshold=0.05)

Scan all asset pairs for cointegration and estimate the half-life of mean reversion of
each retained pair's price spread.

The Engle-Granger test itself is NOT implemented yet: `coint_pvalue` is a placeholder
fixed at `0.0`, so every pair is retained for any positive `pvalue_threshold`, and
`adf_pvalue` is likewise reported as `0.0`.

# Arguments
- `prices`: Dictionary of price series with asset names as keys
- `pvalue_threshold`: Maximum p-value to consider a pair cointegrated

# Returns
- A DataFrame with columns [:asset1, :asset2, :coint_pvalue, :adf_pvalue, :half_life],
  sorted by `coint_pvalue`. Pairs are enumerated over the sorted asset names, so the
  output order does not depend on dictionary iteration order. `half_life` is
  `-log(2) / beta`, with `beta` the slope of the spread's first difference on its lagged
  level; it is `NaN` when the spread has fewer than three points.

# Throws
- `DimensionMismatch` if the two series of a retained pair differ in length
"""
function find_cointegrated_prices(
    prices::AbstractDict{<:AbstractString,<:AbstractVector{<:Real}};
    pvalue_threshold::Real=0.05,
)
    # Sorted names make the pair enumeration deterministic.
    assets = sort!(collect(keys(prices)))
    n = length(assets)
    results = DataFrame(
        asset1=String[], asset2=String[],
        coint_pvalue=Float64[], adf_pvalue=Float64[],
        half_life=Float64[]
    )

    for i in 1:(n - 1)
        for j in (i + 1):n
            asset1, asset2 = assets[i], assets[j]
            p1, p2 = prices[asset1], prices[asset2]

            # Placeholder for the Engle-Granger test (no statistical package is wired in
            # yet); replace with the actual test p-value.
            coint_pvalue = 0.0

            if coint_pvalue < pvalue_threshold
                # Guard explicitly: broadcasting would silently stretch a length-1 series.
                length(p1) == length(p2) || throw(
                    DimensionMismatch(
                        "price series for $asset1 and $asset2 differ in length " *
                        "($(length(p1)) vs $(length(p2)))",
                    ),
                )
                half_life = _spread_half_life(p1 .- p2)

                # adf_pvalue is a placeholder as well.
                push!(results, (asset1, asset2, coint_pvalue, 0.0, half_life))
            end
        end
    end

    return sort(results, :coint_pvalue)
end

# Half-life of mean reversion of `spread`: -log(2) / slope of diff(spread) on lagged spread.
function _spread_half_life(spread::AbstractVector{<:Real})
    # Fewer than three points leave fewer than two (difference, lag) pairs: no slope.
    length(spread) < 3 && return NaN
    lagged = @view spread[1:(end - 1)]
    delta = diff(spread)
    beta = cov(delta, lagged) / var(lagged)
    return -log(2) / beta
end
161  
# Public names provided by this file.
export sort_col_byrowsum!,
    quantile_df,
    cluster_df,
    find_lead_lag_pairs,
    detect_correlation_regime,
    find_cointegrated_prices