/ Data / src / dataframes.jl
dataframes.jl
  1  @doc "Utilities for DataFrames.jl, prominently timeframe based indexing."
  2  module DFUtils
  3  using DataFrames
  4  using DataFrames: index
  5  using ..TimeTicks
  6  import ..TimeTicks: TimeTicks, timeframe, timeframe!
  7  import Misc: after, before
  8  using Misc.DocStringExtensions
  9  using ..Lang
 10  import Base: getindex
 11  import ..Data: contiguous_ts
 12  
 13  @doc "Get the column names for dataframe as symbols.
 14  
 15  $(TYPEDSIGNATURES)
 16  "
 17  colnames(df::AbstractDataFrame) = index(df).names
 18  
 19  @doc "Get the first timestamp in the dataframe (:timestamp column)."
 20  firstdate(df::D) where {D<:AbstractDataFrame} = df.timestamp[begin]
 21  @doc "Get the last timestamp in the dataframe (:timestamp column)."
 22  lastdate(df::D) where {D<:AbstractDataFrame} = df.timestamp[end]
 23  @doc "The zeroed row of a dataframe (`zero(el)` from every column)."
 24  function zerorow(df::D; skip_cols=()) where {D<:AbstractDataFrame}
 25      let cn = ((col for col in colnames(df) if col ∉ skip_cols)...,)
 26          NamedTuple{cn}(zero(eltype(getproperty(df, col))) for col in cn)
 27      end
 28  end
 29  
 30  @doc """Returns the timeframe of a dataframe according to its metadata.
 31  
 32  $(TYPEDSIGNATURES)
 33  
 34  If the value is not found in the metadata, infer it by `timestamp` column of the dataframe.
 35  If the timeframe can't be inferred, a `TimeFrame(0)` is returned.
 36  NOTE: slow func, for speed use [`timeframe!(::DataFrame)`](@ref)"""
 37  function timeframe(df::D)::TimeFrame where {D<:AbstractDataFrame}
 38      if hasproperty(df, :timestamp)
 39          md = @lget!(colmetadata(df), :timestamp, Dict{String,Any}())
 40          @something get(md, "timeframe", nothing) begin
 41              if size(df, 1) > 0
 42                  timeframe!(df)
 43              else
 44                  TimeFrame(Second(0))
 45              end
 46          end
 47      end
 48  end
 49  @doc "Sets the dataframe's timeframe metadata to the given `TimeFrame`.
 50  
 51  Shouldn't be called directly, see [`timeframe!(::DataFrame)`](@ref)"
 52  function timeframe!(df::D, t::T) where {D<:AbstractDataFrame,T<:TimeFrame}
 53      colmetadata!(df, :timestamp, "timeframe", t; style=:note)
 54      t
 55  end
 56  @doc "Infer the dataframe's timeframe from the `timestamp` column of the dataframe and sets it."
 57  function timeframe!(df::D) where {D<:AbstractDataFrame}
 58      @something colmetadata(df, :timestamp, "timeframe", nothing) begin
 59          tf = @infertf(df)
 60          colmetadata!(df, :timestamp, "timeframe", tf; style=:note)
 61          tf
 62      end
 63  end
 64  @doc "Forcefully infers the dataframe timeframe. See [`timeframe!(::DataFrame)`](@ref)"
 65  timeframe!!(df::D) where {D<:AbstractDataFrame} = begin
 66      tf = @infertf(df)
 67      timeframe!(df, tf)
 68      tf
 69  end
 70  
 71  @doc "Get the position of date in the `:timestamp` column of the dataframe.
 72  
 73  $(TYPEDSIGNATURES)"
 74  function dateindex(df::D, date::DateTime) where {D<:AbstractDataFrame}
 75      searchsortedlast(df.timestamp, date)
 76  end
 77  
 78  @doc "Get the position of date in the `:timestamp` column of the dataframe based on timeframe arithmentics.
 79  
 80  $(TYPEDSIGNATURES)"
 81  function dateindex(df::D, date::DateTime, ::Val{:timeframe}) where {D<:AbstractDataFrame}
 82      (date - firstdate(df)) ÷ timeframe(df).period + 1
 83  end
 84  
 85  # TODO: move dateindex to TimeTicks
 86  @doc "Same as [`dateindex`](@ref)"
 87  function dateindex(v::V, date::DateTime) where {V<:AbstractVector}
 88      searchsortedlast(v, date)
 89  end
 90  
 91  @doc "Same [`dateindex(::AbstractVector, ::DateTime)`](@ref) but always returns the first index if the index is not found in the vector."
 92  function dateindex(v::V, date::DateTime, ::Val{:nonzero}) where {V<:AbstractVector}
 93      idx = dateindex(v, date)
 94      if iszero(idx)
 95          firstindex(v)
 96      else
 97          idx
 98      end
 99  end
100  
101  @doc "Same [`dateindex(::AbstractDataFrame, ::DateTime)`](@ref) but always returns the first index if the index is not found in the vector."
102  function dateindex(df::AbstractDataFrame, date::DateTime, ::Val{:nonzero})
103      dateindex(df.timestamp, date, Val(:nonzero))
104  end
105  dateindex(v, date, sym::Symbol) = dateindex(v, date, Val(sym))
106  
107  valueorview(df::DataFrame, idx, col::String) = getproperty(df, col)[idx]
108  valueorview(df::DataFrame, idx, col::Symbol) = getproperty(df, col)[idx]
109  valueorview(df::DataFrame, idx, cols) = @view df[idx, cols]
110  # NOTE: We should subtype an abstract dataframe...arr
111  function getindex(df::D, idx::DateTime, cols) where {D<:AbstractDataFrame}
112      v = valueorview(df, searchsortedlast(df.timestamp, idx), cols)
113      @ifdebug @assert v == getdate(df, idx, cols)
114      v
115  end
116  
117  @doc """Get the specified columns based on given date (used as index).
118  
119  $(TYPEDSIGNATURES)
120  
121  While indexing ohlcv data we have to consider the *time of arrival* of a candle.
122  In general candles collect the price *up to* its timestamp.
123  E.g. the candle at time `2000-01-01` would have tracked time from `1999-12-31T00:00:00` to `2000-01-01T00:00:00`.
124  Therefore what we return is always the *left adjacent* timestamp of the queried one.
125  """
126  function getdate(
127      df::D, idx::DateTime, cols, tf::T=timeframe!(df)
128  ) where {D<:AbstractDataFrame,T<:TimeFrame}
129      @ifdebug @assert @infertf(df) == tf
130      start = firstdate(df)
131      # start = df.timestamp[begin]
132      start <= idx || throw(ArgumentError("$idx not found in dataframe."))
133      int_idx = (idx - start) ÷ tf.period + 1
134      int_idx > size(df)[1] && throw(ArgumentError("$idx not found in dataframe."))
135      @ifdebug @assert df.timestamp[int_idx] == idx
136      valueorview(df, int_idx, cols)
137  end
138  
139  @doc """Get the date-based subset of a DataFrame.
140  
141  $(TYPEDSIGNATURES)
142  
143  Indexing by date ranges allows to query ohlcv using the timestamp column as index, assuming that the data has no missing values and is already sorted.
144  
145  Examples:
146  df[dtr"1999-.."] # Starting from 1999 up to the end
147  df[dtr"..1999-"] # From the beginning up to 1999
148  df[dtr"1999-..2000-"] # The Year 1999
149  """
150  function getdate(
151      df::D, dr::Union{DateRange,StepRange{DateTime,<:Period}}, cols, tf=timeframe!(df)
152  ) where {D<:AbstractDataFrame}
153      @ifdebug @assert @infertf(df) == tf
154      start = firstdate(df)
155      stop = lastdate(df)
156      if (!isnothing(dr.start) && dr.start < start) || (!isnothing(dr.stop) && dr.stop > stop)
157          throw(
158              ArgumentError(
159                  "Dates ($(dr.start) : $(dr.stop)) out of range for dataframe ($start : $stop).",
160              ),
161          )
162      end
163      # arithmetic indexing although slower for smaller arrays, has complexity O(1)ish
164      start_idx = if isnothing(dr.start)
165          firstindex(df.timestamp)
166      else
167          (dr.start - start) ÷ tf.period + 1
168      end
169      stop_idx = if isnothing(dr.stop)
170          lastindex(df.timestamp)
171      else
172          start_idx + (dr.stop - dr.start) ÷ tf.period
173      end
174      # start_idx = searchsortedfirst(df.timestamp, dr.start)
175      # stop_idx = start_idx + searchsortedfirst(@view(df.timestamp[start_idx+1:end]), dr.stop)
176      @ifdebug @assert df.timestamp[start_idx] == dr.start &&
177          df.timestamp[stop_idx] == dr.stop
178      @view df[start_idx:stop_idx, cols]
179  end
180  
181  function getindex(
182      df::D, dr::Union{DateRange,StepRange{DateTime,<:Period}}, cols
183  ) where {D<:AbstractDataFrame}
184      start_idx = searchsortedfirst(df.timestamp, dr.start)
185      stop_idx = searchsortedlast(df.timestamp, dr.stop)
186      v = @view df[start_idx:stop_idx, cols]
187      @ifdebug @assert v == getdate(df, dr, cols)
188      v
189  end
190  
191  function getindex(df::D, idx::DateTime) where {D<:AbstractDataFrame}
192      getindex(df, idx, Symbol.(names(df)))
193  end
194  function getindex(
195      df::D, idx::Union{DateRange,StepRange{DateTime,<:Period}}
196  ) where {D<:AbstractDataFrame}
197      getindex(df, idx, Symbol.(names(df)))
198  end
199  
200  @doc """Get the date range of a DataFrame.
201  
202  $(TYPEDSIGNATURES)
203  
204  Used to get the date range of a DataFrame `df`. It takes in the DataFrame `df`, an optional timeframe `tf` (default is the current timeframe of the DataFrame), and an optional `rightofs` parameter.
205  The `rightofs` parameter specifies the number of steps to shift the date range to the right. For example, if `rightofs` is set to 1, the date range will be shifted one step to the right. This can be useful for calculating future date ranges based on the current date range.
206  Returns the date range of the DataFrame `df` based on the specified timeframe `tf` and `rightofs` parameter.
207  """
208  function daterange(df::D, tf=timeframe(df), rightofs=1) where {D<:AbstractDataFrame}
209      DateRange(df.timestamp[begin], df.timestamp[end] + tf * rightofs, tf)
210  end
211  
212  _copysub(arr::A) where {A<:Array} = arr
213  _copysub(arr::A) where {A<:SubArray} = Array(arr)
214  
215  @doc "Replaces subarrays with arrays.
216  
217  $(TYPEDSIGNATURES)"
218  function copysubs!(
219      df::D, copyfunc=_copysub, elsefunc=Returns(nothing)
220  ) where {D<:AbstractDataFrame}
221      i = 1
222      mask = Vector{Bool}(undef, ncol(df))
223      for col in eachcol(df)
224          if (mask[i] = col isa SubArray)
225              df[!, i] = copyfunc(col)
226          else
227              elsefunc(col)
228          end
229          i += 1
230      end
231      mask
232  end
233  
234  function _make_room(df, capacity, n)
235      if n < 0
236          throw(ArgumentError("n must be non-negative"))
237      end
238      if capacity < 0
239          throw(ArgumentError("capacity must be non-negative"))
240      end
241      current_rows = nrow(df)
242      # Ensure we only delete rows if appending `n` more rows would exceed `capacity`
243      if current_rows + n > capacity
244          copysubs!(df)
245          rows_to_remove = current_rows + n - capacity
246          # Ensure we do not attempt to delete more rows than exist
247          if rows_to_remove > current_rows
248              empty!(df)
249          else
250              # Delete rows from the beginning of the dataframe
251              deleteat!(df, 1:rows_to_remove)
252          end
253      end
254  end
255  
256  @doc "Mutates `v` to `df` ensuring the dataframe never grows larger than `maxlen`.
257  
258  $(TYPEDSIGNATURES)
259  "
260  function _mutatemax!(df, v, maxlen, n, mut; cols=:union)
261      _make_room(df, maxlen, n)
262      mut(df, v; cols)
263      @ifdebug @assert nrow(df) <= maxlen
264  end
265  
266  function _tomaxlen(v, maxlen)
267      li = lastindex(v, 1)
268      from = li - min(maxlen, li) + 1
269      view(v, from:li, :)
270  end
271  
272  @doc "See [`_mutatemax!`](@ref)"
273  function appendmax!(df, v, maxlen; cols=:union)
274      _mutatemax!(df, _tomaxlen(v, maxlen), maxlen, size(v, 1), append!; cols)
275  end
276  @doc "See [`_mutatemax!`](@ref)"
277  function prependmax!(df, v, maxlen; cols=:union)
278      _mutatemax!(df, _tomaxlen(v, maxlen), maxlen, size(v, 1), prepend!; cols)
279  end
280  @doc "See [`_mutatemax!`](@ref)"
281  pushmax!(df, v, maxlen; cols=:union) = _mutatemax!(df, v, maxlen, 1, push!; cols)
282  
283  function contiguous_ts(df::DataFrame, args...; kwargs...)
284      contiguous_ts(df.timestamp, string(timeframe!(df)), args...; kwargs...)
285  end
286  
287  @doc """Get the subset of a DataFrame containing rows after a specific date.
288  
289  $(TYPEDSIGNATURES)
290  
291  This function is used to get the subset of a DataFrame `df` that contains rows after a specific date `dt`. It takes in the DataFrame `df`, the specific date `dt` as a `DateTime` object, and optional columns `cols` to include in the subset.
292  If `cols` is not specified, the function includes all columns in the subset. If `cols` is specified, only the columns listed in `cols` will be included in the subset.
293  This function returns a `DataFrameView` that contains only the rows of `df` that occur after the specified date `dt` and the specified columns `cols`.
294  """
295  function after(df::DataFrame, dt::DateTime, cols=:)
296      idx = dateindex(df, dt) + 1
297      view(df, idx:nrow(df), cols)
298  end
299  
300  @doc "Complement of [`after`](@ref)"
301  function before(df::DataFrame, dt::DateTime, cols=:)
302      idx = dateindex(df, dt) - 1
303      view(df, 1:idx, cols)
304  end
305  
306  @doc "Inserts rows in `src` to `dst`, zeroing columns not present in `dst`.
307  
308  $(TYPEDSIGNATURES)
309  "
310  function addcols!(dst, src)
311      src_cols = Set(colnames(src))
312      dst_cols = colnames(dst)
313      n = nrow(dst)
314      for col in src_cols
315          if col ∉ dst_cols
316              dst[!, col] = similar(getproperty(src, col), n)
317          end
318      end
319  end
320  
321  _fromidx(from::Integer, offset::Integer) = from + offset
322  _fromidx(from::Integer, offset) = from + round(Int, offset, RoundUp)
323  
324  @doc """Create a view of an OHLCV DataFrame starting from a specific index.
325  
326  $(TYPEDSIGNATURES)
327  
328  Used to create a view of an OHLCV DataFrame `ohlcv` starting from a specific index `from`. It takes in the OHLCV DataFrame `ohlcv`, the starting index `from` as an integer, and optional parameters `offset` and `cols`.
329  The `offset` parameter specifies the number of rows to offset the view from the starting index. The default value is 0, indicating no offset.
330  The `cols` parameter specifies the columns to include in the view. By default, all columns are included.
331  Returns a view of the original OHLCV DataFrame `ohlcv` starting from the specified index `from`, with an optional offset and specified columns.
332  """
333  function viewfrom(ohlcv, from::Integer; offset=0, cols=Colon())
334      @view ohlcv[max(1, _fromidx(from, offset)):end, cols]
335  end
336  
337  function viewfrom(ohlcv, from::DateTime; kwargs...)
338      idx = dateindex(ohlcv, from)
339      viewfrom(ohlcv, idx; kwargs...)
340  end
341  
342  function viewfrom(ohlcv, ::Nothing; kwargs...)
343      ohlcv
344  end
345  
346  @doc """Set the values of specific columns in one DataFrame from another DataFrame.
347  
348  $(TYPEDSIGNATURES)
349  
350  Used to set the values of specific columns in one DataFrame `dst` from another DataFrame `src`. It takes in the destination DataFrame `dst`, the source DataFrame `src`, the columns to set `cols`, and optional indices `idx` to specify the rows to set.
351  The `cols` parameter specifies the columns in the destination DataFrame `dst` that will be set with the corresponding values from the source DataFrame `src`.
352  The `idx` parameter specifies the indices of the rows in the destination DataFrame `dst` that will be set. By default, it sets all rows.
353  It mutates the destination DataFrame `dst` by setting the values of the specified columns `cols` with the corresponding values from the source DataFrame `src`.
354  """
355  function setcols!(dst, src, cols, idx=firstindex(dst, 1):lastindex(dst, 1))
356      data_type = eltype(src)
357      for (n, col) in enumerate(cols)
358          if !hasproperty(dst, col)
359              dst[!, col] = Vector{data_type}(1:size(dst, 1))
360          end
361          dst[idx, col] = @view src[:, n]
362      end
363  end
364  
365  export firstdate, lastdate, dateindex, daterange, viewfrom
366  export colnames, getdate, zerorow, addcols!, setcols!
367  
368  end