Cradicle Explorer

/ benchmarks / bench_fp_double_precision.nim
bench_fp_double_precision.nim
  1  # Constantine
  2  # Copyright (c) 2018-2019    Status Research & Development GmbH
  3  # Copyright (c) 2020-Present Mamy André-Ratsimbazafy
  4  # Licensed and distributed under either of
  5  #   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
  6  #   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
  7  # at your option. This file may not be copied, modified, or distributed except according to those terms.
  8  
  9  # ############################################################
 10  #
 11  #             Benchmark of finite fields
 12  #
 13  # ############################################################
 14  
 15  import
 16    # Internals
 17    ../constantine/platforms/abstractions,
 18    ../constantine/math/config/curves,
 19    ../constantine/math/arithmetic,
 20    ../constantine/math/extension_fields,
 21    # Helpers
 22    ../helpers/prng_unsafe,
 23    ./platforms,
 24    # Standard library
 25    std/[monotimes, times, strformat, strutils]
 26  
 27  var rng: RngState
 28  let seed = uint32(getTime().toUnix() and (1'i64 shl 32 - 1)) # unixTime mod 2^32
 29  rng.seed(seed)
 30  echo "bench xoshiro512** seed: ", seed
 31  
 32  # warmup
 33  proc warmup*() =
 34    # Warmup - make sure cpu is on max perf
 35    let start = cpuTime()
 36    var foo = 123
 37    for i in 0 ..< 300_000_000:
 38      foo += i*i mod 456
 39      foo = foo mod 789
 40  
 41    # Compiler shouldn't optimize away the results as cpuTime rely on sideeffects
 42    let stop = cpuTime()
 43    echo &"Warmup: {stop - start:>4.4f} s, result {foo} (displayed to avoid compiler optimizing warmup away)\n"
 44  
 45  warmup()
 46  
 47  when defined(gcc):
 48    echo "\nCompiled with GCC"
 49  elif defined(clang):
 50    echo "\nCompiled with Clang"
 51  elif defined(vcc):
 52    echo "\nCompiled with MSVC"
 53  elif defined(icc):
 54    echo "\nCompiled with ICC"
 55  else:
 56    echo "\nCompiled with an unknown compiler"
 57  
 58  echo "Optimization level => "
 59  echo "  no optimization: ", not defined(release)
 60  echo "  release: ", defined(release)
 61  echo "  danger: ", defined(danger)
 62  echo "  inline assembly: ", UseASM_X86_64
 63  
 64  when CTT_32:
 65    echo "⚠️ Warning: using Constantine with 32-bit limbs"
 66  else:
 67    echo "Using Constantine with 64-bit limbs"
 68  
 69  when SupportsCPUName:
 70    echo "Running on ", cpuName(), ""
 71  
 72  when SupportsGetTicks:
 73    echo "\n⚠️ Cycles measurements are approximate and use the CPU nominal clock: Turbo-Boost and overclocking will skew them."
 74    echo "i.e. a 20% overclock will be about 20% off (assuming no dynamic frequency scaling)"
 75  
 76  echo "\n=================================================================================================================\n"
 77  
 78  proc separator*() =
 79    echo "-".repeat(145)
 80  
 81  proc report(op, field: string, start, stop: MonoTime, startClk, stopClk: int64, iters: int) =
 82    let ns = inNanoseconds((stop-start) div iters)
 83    let throughput = 1e9 / float64(ns)
 84    when SupportsGetTicks:
 85      echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op     {(stopClk - startClk) div iters:>9} CPU cycles (approx)"
 86    else:
 87      echo &"{op:<28} {field:<40} {throughput:>15.3f} ops/s     {ns:>9} ns/op"
 88  
 89  proc notes*() =
 90    echo "Notes:"
 91    echo "  - Compilers:"
 92    echo "    Compilers are severely limited on multiprecision arithmetic."
 93    echo "    Constantine compile-time assembler is used by default (nimble bench_fp)."
 94    echo "    GCC is significantly slower than Clang on multiprecision arithmetic due to catastrophic handling of carries."
 95    echo "    GCC also seems to have issues with large temporaries and register spilling."
 96    echo "    This is somewhat alleviated by Constantine compile-time assembler."
 97    echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc\" or \"nimble bench_ec_g1_clang\"."
 98    echo "    Bench on specific compiler with assembler: \"nimble bench_ec_g1_gcc_noasm\" or \"nimble bench_ec_g1_clang_noasm\"."
 99    echo "  - The simplest operations might be optimized away by the compiler."
100  
template bench(op: string, desc: string, iters: int, body: untyped): untyped =
  ## Run `body` `iters` times and hand the measurements to `report`.
  ##
  ## The monotonic clock is read first and last; the tick counter, when
  ## `SupportsGetTicks` is true, is read just inside that window.
  ## Without tick support both tick values are set to -1.
  let wallBegin = getMonotime()
  when SupportsGetTicks:
    let clkBegin = getTicks()
  else:
    let clkBegin = -1'i64

  for _ in 0 ..< iters:
    body

  when SupportsGetTicks:
    let clkEnd = getTicks()
  else:
    let clkEnd = -1'i64
  let wallEnd = getMonotime()

  report(op, desc, wallBegin, wallEnd, clkBegin, clkEnd, iters)
116  
func random_unsafe(rng: var RngState, a: var FpDbl, Base: typedesc) =
  ## Fill the double-width element `a` from two random `Base` elements.
  ## The result is not reduced modulo p², this is only used for benchmark.
  # The high half is drawn from the RNG before the low half.
  let hiHalf = rng.random_unsafe(Base)
  let loHalf = rng.random_unsafe(Base)
  let loLen = loHalf.mres.limbs.len
  let hiLen = hiHalf.mres.limbs.len
  # Low limbs fill the start of `limbs2x` ...
  for k in 0 ..< loLen:
    a.limbs2x[k] = loHalf.mres.limbs[k]
  # ... and high limbs are stored right after them.
  for k in 0 ..< hiLen:
    a.limbs2x[loLen + k] = hiHalf.mres.limbs[k]
126  
proc sumUnr(T: typedesc, iters: int) =
  ## Benchmark `sumUnr` on two random `T` operands over `iters` iterations.
  let
    lhs = rng.random_unsafe(T)
    rhs = rng.random_unsafe(T)
  var dst: T
  bench("Addition unreduced", $T, iters):
    dst.sumUnr(lhs, rhs)
133  
proc sum(T: typedesc, iters: int) =
  ## Benchmark `sum` on two random `T` operands over `iters` iterations.
  let
    lhs = rng.random_unsafe(T)
    rhs = rng.random_unsafe(T)
  var dst: T
  bench("Addition", $T, iters):
    dst.sum(lhs, rhs)
140  
proc diffUnr(T: typedesc, iters: int) =
  ## Benchmark `diffUnr` on two random `T` operands over `iters` iterations.
  let
    lhs = rng.random_unsafe(T)
    rhs = rng.random_unsafe(T)
  var dst: T
  bench("Substraction unreduced", $T, iters):
    dst.diffUnr(lhs, rhs)
147  
proc diff(T: typedesc, iters: int) =
  ## Benchmark `diff` on two random `T` operands over `iters` iterations.
  let
    lhs = rng.random_unsafe(T)
    rhs = rng.random_unsafe(T)
  var dst: T
  bench("Substraction", $T, iters):
    dst.diff(lhs, rhs)
154  
proc neg(T: typedesc, iters: int) =
  ## Benchmark `neg` on one random `T` operand over `iters` iterations.
  let operand = rng.random_unsafe(T)
  var dst: T
  bench("Negation", $T, iters):
    dst.neg(operand)
160  
proc sum2xUnreduce(T: typedesc, iters: int) =
  ## Benchmark `sum2xUnr` on double-precision operands derived from `T`.
  ## The destination is randomized as well as both inputs.
  var dst, lhs, rhs: doublePrec(T)
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Addition 2x unreduced", $doublePrec(T), iters):
    dst.sum2xUnr(lhs, rhs)
168  
proc sum2x(T: typedesc, iters: int) =
  ## Benchmark `sum2xMod` on double-precision operands derived from `T`.
  ## The destination is randomized as well as both inputs.
  var dst, lhs, rhs: doublePrec(T)
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Addition 2x reduced", $doublePrec(T), iters):
    dst.sum2xMod(lhs, rhs)
176  
proc diff2xUnreduce(T: typedesc, iters: int) =
  ## Benchmark `diff2xUnr` on double-precision operands derived from `T`.
  ## The destination is randomized as well as both inputs.
  var dst, lhs, rhs: doublePrec(T)
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Substraction 2x unreduced", $doublePrec(T), iters):
    dst.diff2xUnr(lhs, rhs)
184  
proc diff2x(T: typedesc, iters: int) =
  ## Benchmark `diff2xMod` on double-precision operands derived from `T`.
  ## The destination is randomized as well as both inputs.
  var dst, lhs, rhs: doublePrec(T)
  rng.random_unsafe(dst, T)
  rng.random_unsafe(lhs, T)
  rng.random_unsafe(rhs, T)
  bench("Substraction 2x reduced", $doublePrec(T), iters):
    dst.diff2xMod(lhs, rhs)
192  
proc neg2x(T: typedesc, iters: int) =
  ## Benchmark `neg2xMod` on a double-precision operand derived from `T`.
  ## Only the input is randomized; the destination starts default-initialized.
  var dst, operand: doublePrec(T)
  rng.random_unsafe(operand, T)
  bench("Negation 2x reduced", $doublePrec(T), iters):
    dst.neg2xMod(operand)
198  
proc prod2xBench*(rLen, aLen, bLen: static int, iters: int) =
  ## Benchmark `prod` of a random `BigInt[aLen]` by a random `BigInt[bLen]`
  ## into a `BigInt[rLen]`, over `iters` iterations.
  let
    lhs = rng.random_unsafe(BigInt[aLen])
    rhs = rng.random_unsafe(BigInt[bLen])
  var dst: BigInt[rLen]
  bench("Multiplication 2x", $rLen & " <- " & $aLen & " x " & $bLen, iters):
    dst.prod(lhs, rhs)
205  
proc square2xBench*(rLen, aLen: static int, iters: int) =
  ## Benchmark `square` of a random `BigInt[aLen]` into a `BigInt[rLen]`,
  ## over `iters` iterations.
  let operand = rng.random_unsafe(BigInt[aLen])
  var dst: BigInt[rLen]
  bench("Squaring 2x", $rLen & " <- " & $aLen & "²", iters):
    dst.square(operand)
211  
proc reduce2x*(T: typedesc, iters: int) =
  ## Benchmark `redc2x` from a random double-precision value into a `T`,
  ## over `iters` iterations.
  var wide: doublePrec(T)
  rng.random_unsafe(wide, T)
  var dst: T

  bench("Redc 2x", $T & " <- " & $doublePrec(T), iters):
    dst.redc2x(wide)
219  
proc reduce2xViaDivision*(T: typedesc, iters: int) =
  ## Benchmark `reduce` of a random big integer, twice as wide as the curve
  ## bit width of `T.C`, by `T.fieldMod()`, over `iters` iterations.
  const wideBits = 2 * T.C.getCurveBitWidth()
  let wide = rng.random_unsafe(BigInt[wideBits])
  var dst: matchingBigInt(T.C)

  bench("Reduction via division", $T & " <- " & $doublePrec(T), iters):
    dst.reduce(wide, T.fieldMod())
228  
proc main() =
  ## Run every benchmark of this file, grouped by separators:
  ## single-width field ops, double-width field ops, then
  ## wide multiplication/squaring and reductions.
  const
    fastIters = 10_000_000 # iteration count for most operations
    slowIters = 10_000     # iteration count for reduction via division

  separator()
  sum(Fp[BLS12_381], iters = fastIters)
  sumUnr(Fp[BLS12_381], iters = fastIters)
  diff(Fp[BLS12_381], iters = fastIters)
  diffUnr(Fp[BLS12_381], iters = fastIters)
  neg(Fp[BLS12_381], iters = fastIters)
  separator()
  sum2x(Fp[BLS12_381], iters = fastIters)
  sum2xUnreduce(Fp[BLS12_381], iters = fastIters)
  diff2x(Fp[BLS12_381], iters = fastIters)
  diff2xUnreduce(Fp[BLS12_381], iters = fastIters)
  neg2x(Fp[BLS12_381], iters = fastIters)
  separator()
  prod2xBench(512, 256, 256, iters = fastIters)
  prod2xBench(768, 384, 384, iters = fastIters)
  square2xBench(512, 256, iters = fastIters)
  square2xBench(768, 384, iters = fastIters)
  reduce2x(Fp[BN254_Snarks], iters = fastIters)
  reduce2x(Fp[BLS12_381], iters = fastIters)
  reduce2xViaDivision(Fp[BN254_Snarks], iters = slowIters)
  reduce2xViaDivision(Fp[BLS12_381], iters = slowIters)
  separator()

# Script entry point: run the benchmarks, then print the closing notes.
main()
notes()