dataframes.jl
1 @doc "Utilities for DataFrames.jl, prominently timeframe based indexing." 2 module DFUtils 3 using DataFrames 4 using DataFrames: index 5 using ..TimeTicks 6 import ..TimeTicks: TimeTicks, timeframe, timeframe! 7 import Misc: after, before 8 using Misc.DocStringExtensions 9 using ..Lang 10 import Base: getindex 11 import ..Data: contiguous_ts 12 13 @doc "Get the column names for dataframe as symbols. 14 15 $(TYPEDSIGNATURES) 16 " 17 colnames(df::AbstractDataFrame) = index(df).names 18 19 @doc "Get the first timestamp in the dataframe (:timestamp column)." 20 firstdate(df::D) where {D<:AbstractDataFrame} = df.timestamp[begin] 21 @doc "Get the last timestamp in the dataframe (:timestamp column)." 22 lastdate(df::D) where {D<:AbstractDataFrame} = df.timestamp[end] 23 @doc "The zeroed row of a dataframe (`zero(el)` from every column)." 24 function zerorow(df::D; skip_cols=()) where {D<:AbstractDataFrame} 25 let cn = ((col for col in colnames(df) if col ∉ skip_cols)...,) 26 NamedTuple{cn}(zero(eltype(getproperty(df, col))) for col in cn) 27 end 28 end 29 30 @doc """Returns the timeframe of a dataframe according to its metadata. 31 32 $(TYPEDSIGNATURES) 33 34 If the value is not found in the metadata, infer it by `timestamp` column of the dataframe. 35 If the timeframe can't be inferred, a `TimeFrame(0)` is returned. 36 NOTE: slow func, for speed use [`timeframe!(::DataFrame)`](@ref)""" 37 function timeframe(df::D)::TimeFrame where {D<:AbstractDataFrame} 38 if hasproperty(df, :timestamp) 39 md = @lget!(colmetadata(df), :timestamp, Dict{String,Any}()) 40 @something get(md, "timeframe", nothing) begin 41 if size(df, 1) > 0 42 timeframe!(df) 43 else 44 TimeFrame(Second(0)) 45 end 46 end 47 end 48 end 49 @doc "Sets the dataframe's timeframe metadata to the given `TimeFrame`. 50 51 Shouldn't be called directly, see [`timeframe!(::DataFrame)`](@ref)" 52 function timeframe!(df::D, t::T) where {D<:AbstractDataFrame,T<:TimeFrame} 53 colmetadata!(df, :timestamp, "timeframe", t; style=:note) 54 t 55 end 56 @doc "Infer the dataframe's timeframe from the `timestamp` column of the dataframe and sets it." 57 function timeframe!(df::D) where {D<:AbstractDataFrame} 58 @something colmetadata(df, :timestamp, "timeframe", nothing) begin 59 tf = @infertf(df) 60 colmetadata!(df, :timestamp, "timeframe", tf; style=:note) 61 tf 62 end 63 end 64 @doc "Forcefully infers the dataframe timeframe. See [`timeframe!(::DataFrame)`](@ref)" 65 timeframe!!(df::D) where {D<:AbstractDataFrame} = begin 66 tf = @infertf(df) 67 timeframe!(df, tf) 68 tf 69 end 70 71 @doc "Get the position of date in the `:timestamp` column of the dataframe. 72 73 $(TYPEDSIGNATURES)" 74 function dateindex(df::D, date::DateTime) where {D<:AbstractDataFrame} 75 searchsortedlast(df.timestamp, date) 76 end 77 78 @doc "Get the position of date in the `:timestamp` column of the dataframe based on timeframe arithmentics. 79 80 $(TYPEDSIGNATURES)" 81 function dateindex(df::D, date::DateTime, ::Val{:timeframe}) where {D<:AbstractDataFrame} 82 (date - firstdate(df)) ÷ timeframe(df).period + 1 83 end 84 85 # TODO: move dateindex to TimeTicks 86 @doc "Same as [`dateindex`](@ref)" 87 function dateindex(v::V, date::DateTime) where {V<:AbstractVector} 88 searchsortedlast(v, date) 89 end 90 91 @doc "Same [`dateindex(::AbstractVector, ::DateTime)`](@ref) but always returns the first index if the index is not found in the vector." 92 function dateindex(v::V, date::DateTime, ::Val{:nonzero}) where {V<:AbstractVector} 93 idx = dateindex(v, date) 94 if iszero(idx) 95 firstindex(v) 96 else 97 idx 98 end 99 end 100 101 @doc "Same [`dateindex(::AbstractDataFrame, ::DateTime)`](@ref) but always returns the first index if the index is not found in the vector." 102 function dateindex(df::AbstractDataFrame, date::DateTime, ::Val{:nonzero}) 103 dateindex(df.timestamp, date, Val(:nonzero)) 104 end 105 dateindex(v, date, sym::Symbol) = dateindex(v, date, Val(sym)) 106 107 valueorview(df::DataFrame, idx, col::String) = getproperty(df, col)[idx] 108 valueorview(df::DataFrame, idx, col::Symbol) = getproperty(df, col)[idx] 109 valueorview(df::DataFrame, idx, cols) = @view df[idx, cols] 110 # NOTE: We should subtype an abstract dataframe...arr 111 function getindex(df::D, idx::DateTime, cols) where {D<:AbstractDataFrame} 112 v = valueorview(df, searchsortedlast(df.timestamp, idx), cols) 113 @ifdebug @assert v == getdate(df, idx, cols) 114 v 115 end 116 117 @doc """Get the specified columns based on given date (used as index). 118 119 $(TYPEDSIGNATURES) 120 121 While indexing ohlcv data we have to consider the *time of arrival* of a candle. 122 In general candles collect the price *up to* its timestamp. 123 E.g. the candle at time `2000-01-01` would have tracked time from `1999-12-31T00:00:00` to `2000-01-01T00:00:00`. 124 Therefore what we return is always the *left adjacent* timestamp of the queried one. 125 """ 126 function getdate( 127 df::D, idx::DateTime, cols, tf::T=timeframe!(df) 128 ) where {D<:AbstractDataFrame,T<:TimeFrame} 129 @ifdebug @assert @infertf(df) == tf 130 start = firstdate(df) 131 # start = df.timestamp[begin] 132 start <= idx || throw(ArgumentError("$idx not found in dataframe.")) 133 int_idx = (idx - start) ÷ tf.period + 1 134 int_idx > size(df)[1] && throw(ArgumentError("$idx not found in dataframe.")) 135 @ifdebug @assert df.timestamp[int_idx] == idx 136 valueorview(df, int_idx, cols) 137 end 138 139 @doc """Get the date-based subset of a DataFrame. 140 141 $(TYPEDSIGNATURES) 142 143 Indexing by date ranges allows to query ohlcv using the timestamp column as index, assuming that the data has no missing values and is already sorted. 144 145 Examples: 146 df[dtr"1999-.."] # Starting from 1999 up to the end 147 df[dtr"..1999-"] # From the beginning up to 1999 148 df[dtr"1999-..2000-"] # The Year 1999 149 """ 150 function getdate( 151 df::D, dr::Union{DateRange,StepRange{DateTime,<:Period}}, cols, tf=timeframe!(df) 152 ) where {D<:AbstractDataFrame} 153 @ifdebug @assert @infertf(df) == tf 154 start = firstdate(df) 155 stop = lastdate(df) 156 if (!isnothing(dr.start) && dr.start < start) || (!isnothing(dr.stop) && dr.stop > stop) 157 throw( 158 ArgumentError( 159 "Dates ($(dr.start) : $(dr.stop)) out of range for dataframe ($start : $stop).", 160 ), 161 ) 162 end 163 # arithmetic indexing although slower for smaller arrays, has complexity O(1)ish 164 start_idx = if isnothing(dr.start) 165 firstindex(df.timestamp) 166 else 167 (dr.start - start) ÷ tf.period + 1 168 end 169 stop_idx = if isnothing(dr.stop) 170 lastindex(df.timestamp) 171 else 172 start_idx + (dr.stop - dr.start) ÷ tf.period 173 end 174 # start_idx = searchsortedfirst(df.timestamp, dr.start) 175 # stop_idx = start_idx + searchsortedfirst(@view(df.timestamp[start_idx+1:end]), dr.stop) 176 @ifdebug @assert df.timestamp[start_idx] == dr.start && 177 df.timestamp[stop_idx] == dr.stop 178 @view df[start_idx:stop_idx, cols] 179 end 180 181 function getindex( 182 df::D, dr::Union{DateRange,StepRange{DateTime,<:Period}}, cols 183 ) where {D<:AbstractDataFrame} 184 start_idx = searchsortedfirst(df.timestamp, dr.start) 185 stop_idx = searchsortedlast(df.timestamp, dr.stop) 186 v = @view df[start_idx:stop_idx, cols] 187 @ifdebug @assert v == getdate(df, dr, cols) 188 v 189 end 190 191 function getindex(df::D, idx::DateTime) where {D<:AbstractDataFrame} 192 getindex(df, idx, Symbol.(names(df))) 193 end 194 function getindex( 195 df::D, idx::Union{DateRange,StepRange{DateTime,<:Period}} 196 ) where {D<:AbstractDataFrame} 197 getindex(df, idx, Symbol.(names(df))) 198 end 199 200 @doc """Get the date range of a DataFrame. 201 202 $(TYPEDSIGNATURES) 203 204 Used to get the date range of a DataFrame `df`. It takes in the DataFrame `df`, an optional timeframe `tf` (default is the current timeframe of the DataFrame), and an optional `rightofs` parameter. 205 The `rightofs` parameter specifies the number of steps to shift the date range to the right. For example, if `rightofs` is set to 1, the date range will be shifted one step to the right. This can be useful for calculating future date ranges based on the current date range. 206 Returns the date range of the DataFrame `df` based on the specified timeframe `tf` and `rightofs` parameter. 207 """ 208 function daterange(df::D, tf=timeframe(df), rightofs=1) where {D<:AbstractDataFrame} 209 DateRange(df.timestamp[begin], df.timestamp[end] + tf * rightofs, tf) 210 end 211 212 _copysub(arr::A) where {A<:Array} = arr 213 _copysub(arr::A) where {A<:SubArray} = Array(arr) 214 215 @doc "Replaces subarrays with arrays. 216 217 $(TYPEDSIGNATURES)" 218 function copysubs!( 219 df::D, copyfunc=_copysub, elsefunc=Returns(nothing) 220 ) where {D<:AbstractDataFrame} 221 i = 1 222 mask = Vector{Bool}(undef, ncol(df)) 223 for col in eachcol(df) 224 if (mask[i] = col isa SubArray) 225 df[!, i] = copyfunc(col) 226 else 227 elsefunc(col) 228 end 229 i += 1 230 end 231 mask 232 end 233 234 function _make_room(df, capacity, n) 235 if n < 0 236 throw(ArgumentError("n must be non-negative")) 237 end 238 if capacity < 0 239 throw(ArgumentError("capacity must be non-negative")) 240 end 241 current_rows = nrow(df) 242 # Ensure we only delete rows if appending `n` more rows would exceed `capacity` 243 if current_rows + n > capacity 244 copysubs!(df) 245 rows_to_remove = current_rows + n - capacity 246 # Ensure we do not attempt to delete more rows than exist 247 if rows_to_remove > current_rows 248 empty!(df) 249 else 250 # Delete rows from the beginning of the dataframe 251 deleteat!(df, 1:rows_to_remove) 252 end 253 end 254 end 255 256 @doc "Mutates `v` to `df` ensuring the dataframe never grows larger than `maxlen`. 257 258 $(TYPEDSIGNATURES) 259 " 260 function _mutatemax!(df, v, maxlen, n, mut; cols=:union) 261 _make_room(df, maxlen, n) 262 mut(df, v; cols) 263 @ifdebug @assert nrow(df) <= maxlen 264 end 265 266 function _tomaxlen(v, maxlen) 267 li = lastindex(v, 1) 268 from = li - min(maxlen, li) + 1 269 view(v, from:li, :) 270 end 271 272 @doc "See [`_mutatemax!`](@ref)" 273 function appendmax!(df, v, maxlen; cols=:union) 274 _mutatemax!(df, _tomaxlen(v, maxlen), maxlen, size(v, 1), append!; cols) 275 end 276 @doc "See [`_mutatemax!`](@ref)" 277 function prependmax!(df, v, maxlen; cols=:union) 278 _mutatemax!(df, _tomaxlen(v, maxlen), maxlen, size(v, 1), prepend!; cols) 279 end 280 @doc "See [`_mutatemax!`](@ref)" 281 pushmax!(df, v, maxlen; cols=:union) = _mutatemax!(df, v, maxlen, 1, push!; cols) 282 283 function contiguous_ts(df::DataFrame, args...; kwargs...) 284 contiguous_ts(df.timestamp, string(timeframe!(df)), args...; kwargs...) 285 end 286 287 @doc """Get the subset of a DataFrame containing rows after a specific date. 288 289 $(TYPEDSIGNATURES) 290 291 This function is used to get the subset of a DataFrame `df` that contains rows after a specific date `dt`. It takes in the DataFrame `df`, the specific date `dt` as a `DateTime` object, and optional columns `cols` to include in the subset. 292 If `cols` is not specified, the function includes all columns in the subset. If `cols` is specified, only the columns listed in `cols` will be included in the subset. 293 This function returns a `DataFrameView` that contains only the rows of `df` that occur after the specified date `dt` and the specified columns `cols`. 294 """ 295 function after(df::DataFrame, dt::DateTime, cols=:) 296 idx = dateindex(df, dt) + 1 297 view(df, idx:nrow(df), cols) 298 end 299 300 @doc "Complement of [`after`](@ref)" 301 function before(df::DataFrame, dt::DateTime, cols=:) 302 idx = dateindex(df, dt) - 1 303 view(df, 1:idx, cols) 304 end 305 306 @doc "Inserts rows in `src` to `dst`, zeroing columns not present in `dst`. 307 308 $(TYPEDSIGNATURES) 309 " 310 function addcols!(dst, src) 311 src_cols = Set(colnames(src)) 312 dst_cols = colnames(dst) 313 n = nrow(dst) 314 for col in src_cols 315 if col ∉ dst_cols 316 dst[!, col] = similar(getproperty(src, col), n) 317 end 318 end 319 end 320 321 _fromidx(from::Integer, offset::Integer) = from + offset 322 _fromidx(from::Integer, offset) = from + round(Int, offset, RoundUp) 323 324 @doc """Create a view of an OHLCV DataFrame starting from a specific index. 325 326 $(TYPEDSIGNATURES) 327 328 Used to create a view of an OHLCV DataFrame `ohlcv` starting from a specific index `from`. It takes in the OHLCV DataFrame `ohlcv`, the starting index `from` as an integer, and optional parameters `offset` and `cols`. 329 The `offset` parameter specifies the number of rows to offset the view from the starting index. The default value is 0, indicating no offset. 330 The `cols` parameter specifies the columns to include in the view. By default, all columns are included. 331 Returns a view of the original OHLCV DataFrame `ohlcv` starting from the specified index `from`, with an optional offset and specified columns. 332 """ 333 function viewfrom(ohlcv, from::Integer; offset=0, cols=Colon()) 334 @view ohlcv[max(1, _fromidx(from, offset)):end, cols] 335 end 336 337 function viewfrom(ohlcv, from::DateTime; kwargs...) 338 idx = dateindex(ohlcv, from) 339 viewfrom(ohlcv, idx; kwargs...) 340 end 341 342 function viewfrom(ohlcv, ::Nothing; kwargs...) 343 ohlcv 344 end 345 346 @doc """Set the values of specific columns in one DataFrame from another DataFrame. 347 348 $(TYPEDSIGNATURES) 349 350 Used to set the values of specific columns in one DataFrame `dst` from another DataFrame `src`. It takes in the destination DataFrame `dst`, the source DataFrame `src`, the columns to set `cols`, and optional indices `idx` to specify the rows to set. 351 The `cols` parameter specifies the columns in the destination DataFrame `dst` that will be set with the corresponding values from the source DataFrame `src`. 352 The `idx` parameter specifies the indices of the rows in the destination DataFrame `dst` that will be set. By default, it sets all rows. 353 It mutates the destination DataFrame `dst` by setting the values of the specified columns `cols` with the corresponding values from the source DataFrame `src`. 354 """ 355 function setcols!(dst, src, cols, idx=firstindex(dst, 1):lastindex(dst, 1)) 356 data_type = eltype(src) 357 for (n, col) in enumerate(cols) 358 if !hasproperty(dst, col) 359 dst[!, col] = Vector{data_type}(1:size(dst, 1)) 360 end 361 dst[idx, col] = @view src[:, n] 362 end 363 end 364 365 export firstdate, lastdate, dateindex, daterange, viewfrom 366 export colnames, getdate, zerorow, addcols!, setcols! 367 368 end