// core.c
  1  // Device- and environment-neutral core matrix-driving functionality.
  2  // See notes near top of arch.h regarding assumptions of hardware
  3  // "common ground." If you find yourself doing an "#ifdef ARDUINO" or
  4  // "#ifdef _SAMD21_" in this file, STOP. Idea is that the code in this
  5  // file is neutral and portable (within aforementioned assumptions).
  6  // Nonportable elements should appear in arch.h. If arch.h functionality
  7  // is lacking, extend it there, do not go making device- or environment-
  8  // specific cases within this file.
  9  
 10  // Function names are intentionally a little obtuse, idea is that one writes
 11  // a more sensible wrapper around this for specific environments (e.g. the
 12  // Arduino stuff in Adafruit_Protomatter.cpp). The "_PM_" prefix on most
 13  // things hopefully makes function and variable name collisions much less
 14  // likely with one's own code.
 15  
 16  #include "core.h" // enums and structs
 17  #include "arch.h" // Do NOT include this in any other source files
 18  
 19  // Overall matrix refresh rate (frames/second) is a function of matrix width
 20  // and chain length, number of address lines, number of bit planes, CPU speed
 21  // and whether or not a GPIO toggle register is available. There is no "this
 22  // will run at X-frames-per-second" constant figure. You typically just have
 23  // to try it out and perhaps trade off some bit planes for refresh rate until
 24  // the image looks good and stable. Anything over 100 Hz is usually passable,
 25  // around 250 Hz is where things firm up. And while this could proceed higher
 26  // in some situations, the tradeoff is that faster rates use progressively
 27  // more CPU time (because it's timer interrupt based and not using DMA or
 28  // special peripherals). So a throttle is set here, an approximate maximum
 29  // frame rate which the software will attempt to avoid exceeding (but may
 30  // refresh slower than this, and in many cases will...just need to set an
 31  // upper limit to avoid excessive CPU load). An incredibly long comment block
 32  // for a single constant, thank you for coming to my TED talk!
// Approximate frame-rate ceiling, in Hz. _PM_begin() divides the timer
// frequency by this value to derive the minimum bitplane-0 timer period.
#define _PM_MAX_REFRESH_HZ 250
 34  
 35  // Time (in microseconds) to pause following any change in address lines
 36  // (individually or collectively). Some matrices respond slowly there...
 37  // must pause on change for matrix to catch up. Defined here (rather than
 38  // arch.h) because it's not architecture-specific.
// Value is in microseconds; the row handler passes it to
// _PM_delayMicroseconds() after each address-line change.
#define _PM_ROW_DELAY 8
 40  
// These are the lowest-level functions for issuing data to matrices.
// There are three versions because it depends on how the six RGB data bits
// (and clock bit) are arranged within a 32-bit PORT register. If all six
// (seven) fit within one byte or word of the PORT, the library's memory
// use (and corresponding data-issuing function) change. This will also have
// an impact on parallel chains in the future, where the number of concurrent
// RGB data bits isn't always six, but some multiple thereof (i.e. up to five
// parallel outputs -- 30 RGB bits + clock -- on a 32-bit PORT, though that's
// largely hypothetical as the chance of finding a PORT with that many bits
// exposed and NOT interfering with other peripherals on a board is very
// low). But I could see four happening, maybe on a Grand Central or
// other kitchen-sink board.
 53  static void blast_byte(Protomatter_core *core, uint8_t *data);
 54  static void blast_word(Protomatter_core *core, uint16_t *data);
 55  static void blast_long(Protomatter_core *core, uint32_t *data);
 56  
 57  // Validate and populate vital elements of core structure.
 58  // Does NOT allocate core struct -- calling function must provide that.
 59  // (In the Arduino C++ library, it’s part of the Protomatter class.)
 60  ProtomatterStatus _PM_init(Protomatter_core *core,
 61    uint16_t bitWidth, uint8_t bitDepth,
 62    uint8_t rgbCount, uint8_t *rgbList,
 63    uint8_t addrCount, uint8_t *addrList,
 64    uint8_t clockPin, uint8_t latchPin, uint8_t oePin,
 65    bool doubleBuffer, void *timer) {
 66      if(!core) return PROTOMATTER_ERR_ARG;
 67  
 68      if(rgbCount  > 5) rgbCount  = 5; // Max 5 in parallel (32-bit PORT)
 69      if(addrCount > 5) addrCount = 5; // Max 5 address lines (A-E)
 70      // bitDepth is NOT constrained here, handle in calling function
 71      // (varies with implementation, e.g. GFX lib is max 6 bitplanes,
 72      // but might be more or less elsewhere)
 73  
 74      // If NULL timer was passed in (the default case for the constructor),
 75      // use default value from arch.h. For example, in the Arduino case it's
 76      // tied to TC4 specifically.
 77      if(timer == NULL) timer = _PM_TIMER_DEFAULT;
 78  
 79      core->timer           = timer;
 80      core->width           = bitWidth; // Total matrix chain length in bits
 81      core->numPlanes       = bitDepth;
 82      core->parallel        = rgbCount;
 83      core->numAddressLines = addrCount;
 84      core->clockPin        = clockPin;
 85      core->latch.pin       = latchPin;
 86      core->oe.pin          = oePin;
 87      core->doubleBuffer    = doubleBuffer;
 88      core->addr            = NULL;
 89      core->screenData      = NULL;
 90  
 91      // Make a copy of the rgbList and addrList tables in case they're
 92      // passed from local vars on the stack or some other non-persistent
 93      // source. screenData is NOT allocated here because data size (byte,
 94      // word, long) is not known until the begin function evaluates all
 95      // the pin bitmasks.
 96  
 97      rgbCount *= 6; // Convert parallel count to pin count
 98      if((core->rgbPins = (uint8_t *)malloc(rgbCount * sizeof(uint8_t)))) {
 99          if((core->addr = (_PM_pin *)malloc(addrCount * sizeof(_PM_pin)))) {
100              memcpy(core->rgbPins, rgbList, rgbCount * sizeof(uint8_t));
101              for(uint8_t i=0; i<addrCount; i++) {
102                  core->addr[i].pin = addrList[i];
103              }
104              return PROTOMATTER_OK;
105          }
106          free(core->rgbPins);
107          core->rgbPins = NULL;
108      }
109      return PROTOMATTER_ERR_MALLOC;
110  }
111  
112  // Allocate display buffers and populate additional elements.
113  ProtomatterStatus _PM_begin(Protomatter_core *core) {
114      if(!core) return PROTOMATTER_ERR_ARG;
115  
116      if(!core->rgbPins) { // NULL if copy failed to allocate
117          return PROTOMATTER_ERR_MALLOC;
118      }
119  
120      // Verify that rgbPins and clockPin are all on the same PORT. If not,
121      // return an error. Pin list is not freed; please call dealloc function.
122      // Also get bitmask of which bits within 32-bit PORT register are
123      // referenced.
124      uint8_t *port = (uint8_t *)_PM_portOutRegister(core->clockPin);
125  #if defined(_PM_portToggleRegister)
126      // If a bit-toggle register is present, the clock pin is included
127      // in determining which bytes of the PORT register are used (and thus
128      // the data storage efficiency).
129      uint32_t bitMask = _PM_portBitMask(core->clockPin);
130  #else
131      // If no bit-toggle register, clock pin can be on any bit, doesn't
132      // affect storage efficiency.
133      uint32_t bitMask = 0;
134  #endif
135  
136      for(uint8_t i=0; i<core->parallel * 6; i++) {
137          uint8_t *p2 = (uint8_t *)_PM_portOutRegister(core->rgbPins[i]);
138          if(p2 != port) {
139              return PROTOMATTER_ERR_PINS;
140          }
141          bitMask |= _PM_portBitMask(core->rgbPins[i]);
142      }
143  
144      // RGB + clock are on same port, we can proceed...
145  
146      // Determine data type for internal representation. If all the data
147      // bitmasks (and possibly clock bitmask, depending whether toggle-bits
148      // register is present) are in the same byte, this can be stored more
149      // compact than if they're spread across a word or long.
150      uint8_t byteMask = 0;
151      if(bitMask & 0xFF000000) byteMask |= 0b1000;
152      if(bitMask & 0x00FF0000) byteMask |= 0b0100;
153      if(bitMask & 0x0000FF00) byteMask |= 0b0010;
154      if(bitMask & 0x000000FF) byteMask |= 0b0001;
155      switch(byteMask) {
156        case 0b0001:                 // If all PORT bits are in the same byte...
157        case 0b0010:
158        case 0b0100:
159        case 0b1000:
160          core->bytesPerElement = 1; // Use 8-bit PORT accesses.
161          break;
162        case 0b0011:                 // If all PORT bits in upper/lower word...
163        case 0b1100:
164          core->bytesPerElement = 2; // Use 16-bit PORT accesses.
165          // Although some devices might tolerate unaligned 16-bit accesses
166          // ('middle' word of 32-bit PORT), that is NOT handled here.
167          // It's a portability liability.
168          break;
169        default:                     // Any other situation...
170          core->bytesPerElement = 4; // Use 32-bit PORT accesses.
171          break;
172      }
173  
174      // Planning for screen data allocation...
175      core->numRowPairs    = 1 << core->numAddressLines;
176      uint8_t  chunks      = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
177      uint16_t columns     = chunks * _PM_chunkSize; // Padded matrix width
178      uint32_t screenBytes = columns * core->numRowPairs * core->numPlanes *
179        core->bytesPerElement;
180  
181      core->bufferSize = screenBytes;    // Bytes per matrix buffer (1 or 2)
182      if(core->doubleBuffer) screenBytes *= 2; // Total for matrix buffer(s)
183      uint32_t rgbMaskBytes = core->parallel * 6 * core->bytesPerElement;
184  
185      // Allocate matrix buffer(s). Don't worry about the return type...
186      // though we might be using words or longs for certain pin configs,
187      // malloc() by definition always aligns to the longest type.
188      if(!(core->screenData = (uint8_t *)malloc(screenBytes + rgbMaskBytes))) {
189          return PROTOMATTER_ERR_MALLOC;
190      }
191  
192      // rgbMask data follows the matrix buffer(s)
193      core->rgbMask = core->screenData + screenBytes;
194  
195  #if !defined(_PM_portToggleRegister)
196      // Clear entire screenData buffer so there's no cruft in any pad bytes
197      // (if using toggle register, each is set to clockMask below instead).
198      memset(core->screenData, 0, screenBytes);
199  #endif
200  
201      // Figure out clockMask and rgbAndClockMask, clear matrix buffers
202      if(core->bytesPerElement == 1) {
203          core->portOffset = _PM_byteOffset(core->rgbPins[0]);
204  #if defined(_PM_portToggleRegister)
205          // Clock and rgbAndClockMask are 8-bit values
206          core->clockMask = _PM_portBitMask(core->clockPin) >>
207            (core->portOffset * 8);
208          core->rgbAndClockMask = (bitMask >> (core->portOffset * 8)) |
209            core->clockMask;
210          memset(core->screenData, core->clockMask, screenBytes);
211  #else
212          // Clock and rgbAndClockMask are 32-bit values
213          core->clockMask       = _PM_portBitMask(core->clockPin);
214          core->rgbAndClockMask = bitMask | core->clockMask;
215  #endif
216          for(uint8_t i=0; i<core->parallel * 6; i++) {
217              ((uint8_t *)core->rgbMask)[i] = // Pin bitmasks are 8-bit
218                _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 8);
219          }
220      } else if(core->bytesPerElement == 2) {
221          core->portOffset = _PM_wordOffset(core->rgbPins[0]);
222  #if defined(_PM_portToggleRegister)
223          // Clock and rgbAndClockMask are 16-bit values
224          core->clockMask = _PM_portBitMask(core->clockPin) >>
225            (core->portOffset * 16);
226          core->rgbAndClockMask = (bitMask >> (core->portOffset * 16)) |
227            core->clockMask;
228          uint32_t elements = screenBytes / 2;
229          for(uint32_t i=0; i<elements; i++) {
230              ((uint16_t *)core->screenData)[i] = core->clockMask;
231          }
232  #else
233          // Clock and rgbAndClockMask are 32-bit values
234          core->clockMask       = _PM_portBitMask(core->clockPin);
235          core->rgbAndClockMask = bitMask | core->clockMask;
236  #endif
237          for(uint8_t i=0; i<core->parallel * 6; i++) {
238              ((uint16_t *)core->rgbMask)[i] = // Pin bitmasks are 16-bit
239                _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 16);
240          }
241      } else {
242          core->portOffset      = 0;
243          core->clockMask       = _PM_portBitMask(core->clockPin);
244          core->rgbAndClockMask = bitMask | core->clockMask;
245  #if defined(_PM_portToggleRegister)
246          uint32_t elements = screenBytes / 4;
247          for(uint32_t i=0; i<elements; i++) {
248              ((uint32_t *)core->screenData)[i] = core->clockMask;
249          }
250  #endif
251          for(uint8_t i=0; i<core->parallel * 6; i++) {
252              ((uint32_t *)core->rgbMask)[i] = // Pin bitmasks are 32-bit
253                _PM_portBitMask(core->rgbPins[i]);
254          }
255      }
256  
257      // Estimate minimum bitplane #0 period for _PM_MAX_REFRESH_HZ rate.
258      uint32_t minPeriodPerFrame = _PM_timerFreq / _PM_MAX_REFRESH_HZ;
259      uint32_t minPeriodPerLine  = minPeriodPerFrame / core->numRowPairs;
260      core->minPeriod = minPeriodPerLine / ((1 << core->numPlanes) - 1);
261      if(core->minPeriod < _PM_minMinPeriod) {
262          core->minPeriod = _PM_minMinPeriod;
263      }
264      // Actual frame rate may be lower than this...it's only an estimate
265      // and does not factor in things like address line selection delays
266      // or interrupt overhead. That's OK, just don't want to exceed this
267      // rate, as it'll eat all the CPU cycles.
268      // Make a wild guess for the initial bit-zero interval. It's okay
269      // that this is off, code adapts to actual timer results pretty quick.
270  
271      core->bitZeroPeriod = core->width * 5; // Initial guesstimate
272  
273      core->activeBuffer  = 0;
274  
275      // Configure pins as outputs and initialize their states.
276  
277      core->latch.setReg   = _PM_portSetRegister(core->latch.pin);
278      core->latch.clearReg = _PM_portClearRegister(core->latch.pin);
279      core->latch.bit      = _PM_portBitMask(core->latch.pin);
280      core->oe.setReg      = _PM_portSetRegister(core->oe.pin);
281      core->oe.clearReg    = _PM_portClearRegister(core->oe.pin);
282      core->oe.bit         = _PM_portBitMask(core->oe.pin);
283  
284      _PM_pinOutput(core->clockPin);
285      _PM_pinLow(core->clockPin);  // Init clock LOW
286      _PM_pinOutput(core->latch.pin);
287      _PM_pinLow(core->latch.pin); // Init latch LOW
288      _PM_pinOutput(core->oe.pin);
289      _PM_pinHigh(core->oe.pin);   // Init OE HIGH (disable output)
290  
291      for(uint8_t i=0; i<core->parallel * 6; i++) {
292          _PM_pinOutput(core->rgbPins[i]);
293          _PM_pinLow(core->rgbPins[i]);
294      }
295  #if defined(_PM_portToggleRegister)
296      core->addrPortToggle = _PM_portToggleRegister(core->addr[0].pin);
297      core->singleAddrPort = 1;
298  #endif
299      for(uint8_t line=0,bit=1; line<core->numAddressLines; line++, bit<<=1) {
300          core->addr[line].setReg =
301            _PM_portSetRegister(core->addr[line].pin);
302          core->addr[line].clearReg =
303            _PM_portClearRegister(core->addr[line].pin);
304          core->addr[line].bit =
305            _PM_portBitMask(core->addr[line].pin);
306          _PM_pinOutput(core->addr[line].pin);
307          if(core->prevRow & bit) {
308              _PM_pinHigh(core->addr[line].pin);
309          } else {
310              _PM_pinLow(core->addr[line].pin);
311          }
312  #if defined(_PM_portToggleRegister)
313          // If address pin on different port than addr 0, no singleAddrPort.
314          if(_PM_portToggleRegister(core->addr[line].pin) !=
315            core->addrPortToggle) {
316              core->singleAddrPort = 0;
317          }
318  #endif
319      }
320  
321      // Get pointers to bit set and clear registers (and toggle, if present)
322      core->setReg    = (uint8_t *)_PM_portSetRegister(core->clockPin);
323      core->clearReg  = (uint8_t *)_PM_portClearRegister(core->clockPin);
324  #if defined(_PM_portToggleRegister)
325      core->toggleReg = (uint8_t *)_PM_portToggleRegister(core->clockPin);
326  #endif
327  
328      // Reset plane/row counters, config and start timer
329      _PM_resume(core);
330  
331      return PROTOMATTER_OK;
332  }
333  
334  // Disable (but do not deallocate) a Protomatter matrix. Disables matrix by
335  // setting OE pin HIGH and writing all-zero data to matrix shift registers,
336  // so it won't halt with lit LEDs.
337  void _PM_stop(Protomatter_core *core) {
338      if((core)) {
339          while(core->swapBuffers);        // Wait for any pending buffer swap
340          _PM_timerStop(core->timer);      // Halt timer
341          *core->oe.setReg = core->oe.bit; // Set OE HIGH (disable output)
342          // So, in PRINCIPLE, setting OE high would be sufficient...
343          // but in case that pin is shared with another function such
344          // as the onloard LED (which pulses during bootloading) let's
345          // also clear out the matrix shift registers for good measure.
346          // Set all RGB pins LOW...
347          for(uint8_t i=0; i<core->parallel * 6; i++) {
348              _PM_pinLow(core->rgbPins[i]);
349          }
350          // Clock out bits (just need to toggle clock with RGBs held low)
351          for(uint32_t i=0; i<core->width; i++) {
352              _PM_pinHigh(core->clockPin);
353              _PM_clockHoldHigh;
354              _PM_pinLow(core->clockPin);
355              _PM_clockHoldLow;
356          }
357          // Latch data
358          *core->latch.setReg   = core->latch.bit;
359          *core->latch.clearReg = core->latch.bit;
360      }
361  }
362  
363  void _PM_resume(Protomatter_core *core) {
364      if((core)) {
365          // Init plane & row to max values so they roll over on 1st interrupt
366          core->plane       = core->numPlanes   - 1;
367          core->row         = core->numRowPairs - 1;
368          core->prevRow     = (core->numRowPairs > 1) ? (core->row - 1) : 1;
369          core->swapBuffers = 0;
370          core->frameCount  = 0;
371  
372          _PM_timerInit(core->timer);        // Configure timer
373          _PM_timerStart(core->timer, 1000); // Start timer
374      }
375  }
376  
377  // Free memory associated with core structure. Does NOT dealloc struct.
378  void _PM_free(Protomatter_core *core) {
379      if((core)) {
380          _PM_stop(core);
381          // TO DO: Set all pins back to inputs here?
382          if(core->screenData) free(core->screenData);
383          if(core->addr)       free(core->addr);
384          if(core->rgbPins) {
385              free(core->rgbPins);
386              core->rgbPins = NULL;
387          }
388      }
389  }
390  
391  
392  // ISR function (in arch.h) calls this function which it extern'd.
393  void _PM_row_handler(Protomatter_core *core) {
394  
395      *core->oe.setReg = core->oe.bit; // Disable LED output
396  
397      *core->latch.setReg   = core->latch.bit; // Latch data from PRIOR pass
398      // Stop timer, save count value at stop
399      uint32_t elapsed = _PM_timerStop(core->timer);
400      uint8_t prevPlane = core->plane; // Save that plane # for later timing
401      *core->latch.clearReg = core->latch.bit; // (split to add a few cycles)
402  
403      // If plane 0 just finished being displayed (plane 1 was loaded on prior
404      // pass, or there's only one plane...I know, it's confusing), take note
405      // of the elapsed timer value, for subsequent bitplane timing (each
406      // plane period is double the previous). Value is filtered slightly to
407      // avoid jitter.
408      if((prevPlane == 1) || (core->numPlanes == 1)) {
409          core->bitZeroPeriod = ((core->bitZeroPeriod * 7) + elapsed) / 8;
410          if(core->bitZeroPeriod < core->minPeriod) {
411              core->bitZeroPeriod = core->minPeriod;
412          }
413      }
414  
415      if(prevPlane == 0) { // Plane 0 just finished loading
416  #if defined(_PM_portToggleRegister)
417          // If all address lines are on a single PORT (and bit toggle is
418          // available), do address line change all at once. Even doing all
419          // this math takes MUCH less time than the delays required when
420          // doing line-by-line changes.
421          if(core->singleAddrPort) {
422              // Make bitmasks of prior and new row bits
423              uint32_t priorBits = 0, newBits = 0;
424              for(uint8_t line=0,bit=1; line<core->numAddressLines;
425                line++, bit<<=1) {
426                  if(core->row & bit) {
427                      newBits |= core->addr[line].bit;
428                  }
429                  if(core->prevRow & bit) {
430                      priorBits |= core->addr[line].bit;
431                  }
432              }
433              *core->addrPortToggle = newBits ^ priorBits;
434              _PM_delayMicroseconds(_PM_ROW_DELAY);
435          } else {
436  #endif
437              // Configure row address lines individually, making changes
438              // (with delays) only where necessary.
439              for(uint8_t line=0,bit=1; line<core->numAddressLines;
440                line++, bit<<=1) {
441                  if((core->row & bit) != (core->prevRow & bit)) {
442                      if(core->row & bit) { // Set addr line high
443                           *core->addr[line].setReg = core->addr[line].bit;
444                      } else { // Set addr line low
445                           *core->addr[line].clearReg = core->addr[line].bit;
446                      }
447                      _PM_delayMicroseconds(_PM_ROW_DELAY);
448                  }
449              }
450  #if defined(_PM_portToggleRegister)
451          }
452  #endif
453          core->prevRow = core->row;
454      }
455  
456      // Advance bitplane index and/or row as necessary
457      if(++core->plane >= core->numPlanes) {     // Next data bitplane, or
458          core->plane = 0;                       // roll over bitplane to start
459          if(++core->row >= core->numRowPairs) { // Next row, or
460              core->row = 0;                     // roll over row to start
461              // Switch matrix buffers if due (only if double-buffered)
462              if(core->swapBuffers) {
463                  core->activeBuffer = 1 - core->activeBuffer;
464                  core->swapBuffers  = 0;        // Swapped!
465              }
466              core->frameCount++;
467          }
468      }
469  
470      // 'plane' now is index of data to issue, NOT data to display.
471      // 'prevPlane' is the previously-loaded data, which gets displayed
472      // now while the next plane data is loaded.
473  
474      // Set timer and enable LED output for data loaded on PRIOR pass:
475      _PM_timerStart(core->timer, core->bitZeroPeriod << prevPlane);
476      *core->oe.clearReg = core->oe.bit; // Enable LED output
477  
478      uint32_t elementsPerLine = _PM_chunkSize *
479          ((core->width + (_PM_chunkSize - 1)) / _PM_chunkSize);
480      uint32_t srcOffset = elementsPerLine *
481        (core->numPlanes * core->row + core->plane) * core->bytesPerElement;
482      if(core->doubleBuffer) {
483          srcOffset += core->bufferSize * core->activeBuffer;
484      }
485  
486      if(core->bytesPerElement == 1) {
487          blast_byte(core, (uint8_t *)(core->screenData + srcOffset));
488      } else if(core->bytesPerElement == 2) {
489          blast_word(core, (uint16_t *)(core->screenData + srcOffset));
490      } else {
491          blast_long(core, (uint32_t *)(core->screenData + srcOffset));
492      }
493  
494      // 'plane' data is now loaded, will be shown on NEXT pass
495  }
496  
497  // Innermost data-stuffing loop functions
498  
499  // The presence of a bit-toggle register can make the data-stuffing loop a
500  // fair bit faster (2 PORT accesses per column vs 3). But ironically, some
501  // devices (e.g. SAMD51) can outpace the matrix max CLK speed, so we slow
502  // them down with a few NOPs. These are defined in arch.h as needed.
503  // _PM_clockHoldLow is whatever code necessary to delay the clock rise
504  // after data is placed on the PORT. _PM_clockHoldHigh is code for delay
505  // before setting the clock back low. If undefined, nothing goes there.
506  
507  #if defined(_PM_portToggleRegister)
508    #define PEW \
509      *toggle  = *data++; /* Toggle in new data + toggle clock low */ \
510      _PM_clockHoldLow; \
511      *toggle  =  clock;  /* Toggle clock high */ \
512      _PM_clockHoldHigh;
513  #else
514    #define PEW \
515      *set     = *data++;   /* Set RGB data high */ \
516      _PM_clockHoldLow; \
517      *set32   =  clock;    /* Set clock high */ \
518      _PM_clockHoldHigh; \
519      *clear32 =  rgbclock; /* Clear RGB data + clock */
520  #endif
521  
522  #if _PM_chunkSize == 1
523    #define PEW_UNROLL PEW
524  #elif _PM_chunkSize == 8
525    #define PEW_UNROLL PEW PEW PEW PEW PEW PEW PEW PEW
526  #elif _PM_chunkSize == 16
527    #define PEW_UNROLL \
528      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
529  #elif _PM_chunkSize == 32
530    #define PEW_UNROLL \
531      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
532      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
533  #elif _PM_chunkSize == 64
534    #define PEW_UNROLL \
535      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
536      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
537      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \
538      PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW
539  #else
540    #error "Unimplemented _PM_chunkSize value"
541  #endif
542  
543  // There are THREE COPIES of the following function -- one each for byte,
544  // word and long. If changes are made in any one of them, the others MUST
545  // be updated to match! (Decided against using macro tricks for the
546  // function, too often ends in disaster...but must be vigilant in the
547  // three-function maintenance then.)
548  
549  static void blast_byte(Protomatter_core *core, uint8_t *data) {
550  #if defined(_PM_portToggleRegister)
551      // If here, it was established in begin() that the RGB data bits and
552      // clock are all within the same byte of a PORT register, else we'd be
553      // in the word- or long-blasting functions now. So we just need an
554      // 8-bit pointer to the PORT.
555      volatile uint8_t *toggle = (volatile uint8_t *)core->toggleReg +
556          core->portOffset;
557  #else
558      // No-toggle version is a little different. If here, RGB data is all
559      // in one byte of PORT register, clock can be any bit in 32-bit PORT.
560      volatile uint8_t  *set;     // For RGB data set
561      volatile uint32_t *set32;   // For clock set
562      volatile uint32_t *clear32; // For RGB data + clock clear
563      set     = (volatile uint8_t *)core->setReg + portOffset;
564      set32   = (volatile uint32_t *)core->setReg;
565      clear32 = (volatile uint32_t *)core->clearReg;
566      uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
567  #endif
568      uint32_t clock  = core->clockMask; // Clock bit
569      uint8_t  chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
570  
571      // PORT has already been initialized with RGB data + clock bits
572      // all LOW, so we don't need to initialize that state here.
573  
574      while(chunks--) {
575          PEW_UNROLL // _PM_chunkSize RGB+clock writes
576      }
577  
578  #if defined(_PM_portToggleRegister)
579      // Want the PORT left with RGB data and clock LOW on function exit
580      // (so it's easier to see on 'scope, and to prime it for the next call).
581      // This is implicit in the no-toggle case (due to how the PEW macro
582      // works), but toggle case requires explicitly clearing those bits.
583      // rgbAndClockMask is an 8-bit value when toggling, hence offset here.
584      *((volatile uint8_t *)core->clearReg + core->portOffset) =
585        core->rgbAndClockMask;
586  #endif
587  }
588  
589  static void blast_word(Protomatter_core *core, uint16_t *data) {
590  #if defined(_PM_portToggleRegister)
591      // See notes above -- except now 16-bit word in PORT.
592      volatile uint16_t *toggle = (volatile uint16_t *)core->toggleReg +
593          core->portOffset;
594  #else
595      volatile uint16_t *set;     // For RGB data set
596      volatile uint32_t *set32;   // For clock set
597      volatile uint32_t *clear32; // For RGB data + clock clear
598      set     = (volatile uint16_t *)core->setReg + core->portOffset;
599      set32   = (volatile uint32_t *)core->setReg;
600      clear32 = (volatile uint32_t *)core->clearReg;
601      uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
602  #endif
603      uint32_t clock  = core->clockMask; // Clock bit
604      uint8_t  chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
605      while(chunks--) {
606          PEW_UNROLL // _PM_chunkSize RGB+clock writes
607      }
608  #if defined(_PM_portToggleRegister)
609      // rgbAndClockMask is a 16-bit value when toggling, hence offset here.
610      *((volatile uint16_t *)core->clearReg + core->portOffset) =
611          core->rgbAndClockMask;
612  #endif
613  }
614  
615  static void blast_long(Protomatter_core *core, uint32_t *data) {
616  #if defined(_PM_portToggleRegister)
617      // See notes above -- except now full 32-bit PORT.
618      volatile uint32_t *toggle = (volatile uint32_t *)core->toggleReg;
619  #else
620      // Note in this case two copies exist of the PORT set register.
621      // The optimizer will most likely simplify this; leaving as-is, not
622      // wanting a special case of the PEW macro due to divergence risk.
623      volatile uint32_t *set;     // For RGB data set
624      volatile uint32_t *set32;   // For clock set
625      volatile uint32_t *clear32; // For RGB data + clock clear
626      set     = (volatile uint32_t *)core->setReg;
627      set32   = (volatile uint32_t *)core->setReg;
628      clear32 = (volatile uint32_t *)core->clearReg;
629      uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit
630  #endif
631      uint32_t clock  = core->clockMask; // Clock bit
632      uint8_t  chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize;
633      while(chunks--) {
634          PEW_UNROLL // _PM_chunkSize RGB+clock writes
635      }
636  #if defined(_PM_portToggleRegister)
637      *(volatile uint32_t *)core->clearReg = core->rgbAndClockMask;
638  #endif
639  }
640  
641  // Returns current value of frame counter and resets its value to zero.
642  // Two calls to this, timed one second apart (or use math with other
643  // intervals), can be used to get a rough frames-per-second value for
644  // the matrix (since this is difficult to estimate beforehand).
645  uint32_t _PM_getFrameCount(Protomatter_core *core) {
646      uint32_t count = 0;
647      if((core)) {
648          count = core->frameCount;
649          core->frameCount = 0;
650      }
651      return count;
652  }
653  
654  // Note to future self: I've gone back and forth between implementing all
655  // this either as it currently is (with byte, word and long cases for various
656  // steps), or using a uint32_t[64] table for expanding RGB bit combos to PORT
657  // bit combos. The latter would certainly simplify the code a ton, and the
658  // additional table lookup step wouldn't significantly impact performance,
659  // especially going forward with faster processors (the SAMD51 code already
660  // requires a few NOPs in the innermost loop to avoid outpacing the matrix).
661  // BUT, the reason this is NOT currently done is that it only allows for a
662  // single matrix chain (doing parallel chains would require either an
663  // impractically large lookup table, or adding together multiple tables'
664  // worth of bitmasks, which would slow things down in the vital inner loop).
665  // Although parallel matrix chains aren't yet 100% implemented in this code
666  // right now, I wanted to leave that possibility for the future, as a way to
667  // handle larger matrix combos, because long chains will slow down the
668  // refresh rate.