// core.c
1 // Device- and environment-neutral core matrix-driving functionality. 2 // See notes near top of arch.h regarding assumptions of hardware 3 // "common ground." If you find yourself doing an "#ifdef ARDUINO" or 4 // "#ifdef _SAMD21_" in this file, STOP. Idea is that the code in this 5 // file is neutral and portable (within aforementioned assumptions). 6 // Nonportable elements should appear in arch.h. If arch.h functionality 7 // is lacking, extend it there, do not go making device- or environment- 8 // specific cases within this file. 9 10 // Function names are intentionally a little obtuse, idea is that one writes 11 // a more sensible wrapper around this for specific environments (e.g. the 12 // Arduino stuff in Adafruit_Protomatter.cpp). The "_PM_" prefix on most 13 // things hopefully makes function and variable name collisions much less 14 // likely with one's own code. 15 16 #include "core.h" // enums and structs 17 #include "arch.h" // Do NOT include this in any other source files 18 19 // Overall matrix refresh rate (frames/second) is a function of matrix width 20 // and chain length, number of address lines, number of bit planes, CPU speed 21 // and whether or not a GPIO toggle register is available. There is no "this 22 // will run at X-frames-per-second" constant figure. You typically just have 23 // to try it out and perhaps trade off some bit planes for refresh rate until 24 // the image looks good and stable. Anything over 100 Hz is usually passable, 25 // around 250 Hz is where things firm up. And while this could proceed higher 26 // in some situations, the tradeoff is that faster rates use progressively 27 // more CPU time (because it's timer interrupt based and not using DMA or 28 // special peripherals). 
So a throttle is set here, an approximate maximum 29 // frame rate which the software will attempt to avoid exceeding (but may 30 // refresh slower than this, and in many cases will...just need to set an 31 // upper limit to avoid excessive CPU load). An incredibly long comment block 32 // for a single constant, thank you for coming to my TED talk! 33 #define _PM_MAX_REFRESH_HZ 250 34 35 // Time (in microseconds) to pause following any change in address lines 36 // (individually or collectively). Some matrices respond slowly there... 37 // must pause on change for matrix to catch up. Defined here (rather than 38 // arch.h) because it's not architecture-specific. 39 #define _PM_ROW_DELAY 8 40 41 // These are the lowest-level functions for issing data to matrices. 42 // There are three versions because it depends on how the six RGB data bits 43 // (and clock bit) are arranged within a 32-bit PORT register. If all six 44 // (seven) fit within one byte or word of the PORT, the library's memory 45 // use (and corresponding data-issuing function) change. This will also have 46 // an impact on parallel chains in the future, where the number of concurrent 47 // RGB data bits isn't always six, but some multiple thereof (i.e. up to five 48 // parallel outputs -- 30 RGB bits + clock -- on a 32-bit PORT, though that's 49 // largely hypothetical as the chance of finding a PORT with that many bits 50 // exposed and NOT interfering with other peripherals on a board is highly 51 // improbable. But I could see four happening, maybe on a Grand Central or 52 // other kitchen-sink board. 53 static void blast_byte(Protomatter_core *core, uint8_t *data); 54 static void blast_word(Protomatter_core *core, uint16_t *data); 55 static void blast_long(Protomatter_core *core, uint32_t *data); 56 57 // Validate and populate vital elements of core structure. 58 // Does NOT allocate core struct -- calling function must provide that. 
59 // (In the Arduino C++ library, it’s part of the Protomatter class.) 60 ProtomatterStatus _PM_init(Protomatter_core *core, 61 uint16_t bitWidth, uint8_t bitDepth, 62 uint8_t rgbCount, uint8_t *rgbList, 63 uint8_t addrCount, uint8_t *addrList, 64 uint8_t clockPin, uint8_t latchPin, uint8_t oePin, 65 bool doubleBuffer, void *timer) { 66 if(!core) return PROTOMATTER_ERR_ARG; 67 68 if(rgbCount > 5) rgbCount = 5; // Max 5 in parallel (32-bit PORT) 69 if(addrCount > 5) addrCount = 5; // Max 5 address lines (A-E) 70 // bitDepth is NOT constrained here, handle in calling function 71 // (varies with implementation, e.g. GFX lib is max 6 bitplanes, 72 // but might be more or less elsewhere) 73 74 // If NULL timer was passed in (the default case for the constructor), 75 // use default value from arch.h. For example, in the Arduino case it's 76 // tied to TC4 specifically. 77 if(timer == NULL) timer = _PM_TIMER_DEFAULT; 78 79 core->timer = timer; 80 core->width = bitWidth; // Total matrix chain length in bits 81 core->numPlanes = bitDepth; 82 core->parallel = rgbCount; 83 core->numAddressLines = addrCount; 84 core->clockPin = clockPin; 85 core->latch.pin = latchPin; 86 core->oe.pin = oePin; 87 core->doubleBuffer = doubleBuffer; 88 core->addr = NULL; 89 core->screenData = NULL; 90 91 // Make a copy of the rgbList and addrList tables in case they're 92 // passed from local vars on the stack or some other non-persistent 93 // source. screenData is NOT allocated here because data size (byte, 94 // word, long) is not known until the begin function evaluates all 95 // the pin bitmasks. 
96 97 rgbCount *= 6; // Convert parallel count to pin count 98 if((core->rgbPins = (uint8_t *)malloc(rgbCount * sizeof(uint8_t)))) { 99 if((core->addr = (_PM_pin *)malloc(addrCount * sizeof(_PM_pin)))) { 100 memcpy(core->rgbPins, rgbList, rgbCount * sizeof(uint8_t)); 101 for(uint8_t i=0; i<addrCount; i++) { 102 core->addr[i].pin = addrList[i]; 103 } 104 return PROTOMATTER_OK; 105 } 106 free(core->rgbPins); 107 core->rgbPins = NULL; 108 } 109 return PROTOMATTER_ERR_MALLOC; 110 } 111 112 // Allocate display buffers and populate additional elements. 113 ProtomatterStatus _PM_begin(Protomatter_core *core) { 114 if(!core) return PROTOMATTER_ERR_ARG; 115 116 if(!core->rgbPins) { // NULL if copy failed to allocate 117 return PROTOMATTER_ERR_MALLOC; 118 } 119 120 // Verify that rgbPins and clockPin are all on the same PORT. If not, 121 // return an error. Pin list is not freed; please call dealloc function. 122 // Also get bitmask of which bits within 32-bit PORT register are 123 // referenced. 124 uint8_t *port = (uint8_t *)_PM_portOutRegister(core->clockPin); 125 #if defined(_PM_portToggleRegister) 126 // If a bit-toggle register is present, the clock pin is included 127 // in determining which bytes of the PORT register are used (and thus 128 // the data storage efficiency). 129 uint32_t bitMask = _PM_portBitMask(core->clockPin); 130 #else 131 // If no bit-toggle register, clock pin can be on any bit, doesn't 132 // affect storage efficiency. 133 uint32_t bitMask = 0; 134 #endif 135 136 for(uint8_t i=0; i<core->parallel * 6; i++) { 137 uint8_t *p2 = (uint8_t *)_PM_portOutRegister(core->rgbPins[i]); 138 if(p2 != port) { 139 return PROTOMATTER_ERR_PINS; 140 } 141 bitMask |= _PM_portBitMask(core->rgbPins[i]); 142 } 143 144 // RGB + clock are on same port, we can proceed... 145 146 // Determine data type for internal representation. 
If all the data 147 // bitmasks (and possibly clock bitmask, depending whether toggle-bits 148 // register is present) are in the same byte, this can be stored more 149 // compact than if they're spread across a word or long. 150 uint8_t byteMask = 0; 151 if(bitMask & 0xFF000000) byteMask |= 0b1000; 152 if(bitMask & 0x00FF0000) byteMask |= 0b0100; 153 if(bitMask & 0x0000FF00) byteMask |= 0b0010; 154 if(bitMask & 0x000000FF) byteMask |= 0b0001; 155 switch(byteMask) { 156 case 0b0001: // If all PORT bits are in the same byte... 157 case 0b0010: 158 case 0b0100: 159 case 0b1000: 160 core->bytesPerElement = 1; // Use 8-bit PORT accesses. 161 break; 162 case 0b0011: // If all PORT bits in upper/lower word... 163 case 0b1100: 164 core->bytesPerElement = 2; // Use 16-bit PORT accesses. 165 // Although some devices might tolerate unaligned 16-bit accesses 166 // ('middle' word of 32-bit PORT), that is NOT handled here. 167 // It's a portability liability. 168 break; 169 default: // Any other situation... 170 core->bytesPerElement = 4; // Use 32-bit PORT accesses. 171 break; 172 } 173 174 // Planning for screen data allocation... 175 core->numRowPairs = 1 << core->numAddressLines; 176 uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize; 177 uint16_t columns = chunks * _PM_chunkSize; // Padded matrix width 178 uint32_t screenBytes = columns * core->numRowPairs * core->numPlanes * 179 core->bytesPerElement; 180 181 core->bufferSize = screenBytes; // Bytes per matrix buffer (1 or 2) 182 if(core->doubleBuffer) screenBytes *= 2; // Total for matrix buffer(s) 183 uint32_t rgbMaskBytes = core->parallel * 6 * core->bytesPerElement; 184 185 // Allocate matrix buffer(s). Don't worry about the return type... 186 // though we might be using words or longs for certain pin configs, 187 // malloc() by definition always aligns to the longest type. 
188 if(!(core->screenData = (uint8_t *)malloc(screenBytes + rgbMaskBytes))) { 189 return PROTOMATTER_ERR_MALLOC; 190 } 191 192 // rgbMask data follows the matrix buffer(s) 193 core->rgbMask = core->screenData + screenBytes; 194 195 #if !defined(_PM_portToggleRegister) 196 // Clear entire screenData buffer so there's no cruft in any pad bytes 197 // (if using toggle register, each is set to clockMask below instead). 198 memset(core->screenData, 0, screenBytes); 199 #endif 200 201 // Figure out clockMask and rgbAndClockMask, clear matrix buffers 202 if(core->bytesPerElement == 1) { 203 core->portOffset = _PM_byteOffset(core->rgbPins[0]); 204 #if defined(_PM_portToggleRegister) 205 // Clock and rgbAndClockMask are 8-bit values 206 core->clockMask = _PM_portBitMask(core->clockPin) >> 207 (core->portOffset * 8); 208 core->rgbAndClockMask = (bitMask >> (core->portOffset * 8)) | 209 core->clockMask; 210 memset(core->screenData, core->clockMask, screenBytes); 211 #else 212 // Clock and rgbAndClockMask are 32-bit values 213 core->clockMask = _PM_portBitMask(core->clockPin); 214 core->rgbAndClockMask = bitMask | core->clockMask; 215 #endif 216 for(uint8_t i=0; i<core->parallel * 6; i++) { 217 ((uint8_t *)core->rgbMask)[i] = // Pin bitmasks are 8-bit 218 _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 8); 219 } 220 } else if(core->bytesPerElement == 2) { 221 core->portOffset = _PM_wordOffset(core->rgbPins[0]); 222 #if defined(_PM_portToggleRegister) 223 // Clock and rgbAndClockMask are 16-bit values 224 core->clockMask = _PM_portBitMask(core->clockPin) >> 225 (core->portOffset * 16); 226 core->rgbAndClockMask = (bitMask >> (core->portOffset * 16)) | 227 core->clockMask; 228 uint32_t elements = screenBytes / 2; 229 for(uint32_t i=0; i<elements; i++) { 230 ((uint16_t *)core->screenData)[i] = core->clockMask; 231 } 232 #else 233 // Clock and rgbAndClockMask are 32-bit values 234 core->clockMask = _PM_portBitMask(core->clockPin); 235 core->rgbAndClockMask = bitMask | 
core->clockMask; 236 #endif 237 for(uint8_t i=0; i<core->parallel * 6; i++) { 238 ((uint16_t *)core->rgbMask)[i] = // Pin bitmasks are 16-bit 239 _PM_portBitMask(core->rgbPins[i]) >> (core->portOffset * 16); 240 } 241 } else { 242 core->portOffset = 0; 243 core->clockMask = _PM_portBitMask(core->clockPin); 244 core->rgbAndClockMask = bitMask | core->clockMask; 245 #if defined(_PM_portToggleRegister) 246 uint32_t elements = screenBytes / 4; 247 for(uint32_t i=0; i<elements; i++) { 248 ((uint32_t *)core->screenData)[i] = core->clockMask; 249 } 250 #endif 251 for(uint8_t i=0; i<core->parallel * 6; i++) { 252 ((uint32_t *)core->rgbMask)[i] = // Pin bitmasks are 32-bit 253 _PM_portBitMask(core->rgbPins[i]); 254 } 255 } 256 257 // Estimate minimum bitplane #0 period for _PM_MAX_REFRESH_HZ rate. 258 uint32_t minPeriodPerFrame = _PM_timerFreq / _PM_MAX_REFRESH_HZ; 259 uint32_t minPeriodPerLine = minPeriodPerFrame / core->numRowPairs; 260 core->minPeriod = minPeriodPerLine / ((1 << core->numPlanes) - 1); 261 if(core->minPeriod < _PM_minMinPeriod) { 262 core->minPeriod = _PM_minMinPeriod; 263 } 264 // Actual frame rate may be lower than this...it's only an estimate 265 // and does not factor in things like address line selection delays 266 // or interrupt overhead. That's OK, just don't want to exceed this 267 // rate, as it'll eat all the CPU cycles. 268 // Make a wild guess for the initial bit-zero interval. It's okay 269 // that this is off, code adapts to actual timer results pretty quick. 270 271 core->bitZeroPeriod = core->width * 5; // Initial guesstimate 272 273 core->activeBuffer = 0; 274 275 // Configure pins as outputs and initialize their states. 
276 277 core->latch.setReg = _PM_portSetRegister(core->latch.pin); 278 core->latch.clearReg = _PM_portClearRegister(core->latch.pin); 279 core->latch.bit = _PM_portBitMask(core->latch.pin); 280 core->oe.setReg = _PM_portSetRegister(core->oe.pin); 281 core->oe.clearReg = _PM_portClearRegister(core->oe.pin); 282 core->oe.bit = _PM_portBitMask(core->oe.pin); 283 284 _PM_pinOutput(core->clockPin); 285 _PM_pinLow(core->clockPin); // Init clock LOW 286 _PM_pinOutput(core->latch.pin); 287 _PM_pinLow(core->latch.pin); // Init latch LOW 288 _PM_pinOutput(core->oe.pin); 289 _PM_pinHigh(core->oe.pin); // Init OE HIGH (disable output) 290 291 for(uint8_t i=0; i<core->parallel * 6; i++) { 292 _PM_pinOutput(core->rgbPins[i]); 293 _PM_pinLow(core->rgbPins[i]); 294 } 295 #if defined(_PM_portToggleRegister) 296 core->addrPortToggle = _PM_portToggleRegister(core->addr[0].pin); 297 core->singleAddrPort = 1; 298 #endif 299 for(uint8_t line=0,bit=1; line<core->numAddressLines; line++, bit<<=1) { 300 core->addr[line].setReg = 301 _PM_portSetRegister(core->addr[line].pin); 302 core->addr[line].clearReg = 303 _PM_portClearRegister(core->addr[line].pin); 304 core->addr[line].bit = 305 _PM_portBitMask(core->addr[line].pin); 306 _PM_pinOutput(core->addr[line].pin); 307 if(core->prevRow & bit) { 308 _PM_pinHigh(core->addr[line].pin); 309 } else { 310 _PM_pinLow(core->addr[line].pin); 311 } 312 #if defined(_PM_portToggleRegister) 313 // If address pin on different port than addr 0, no singleAddrPort. 
314 if(_PM_portToggleRegister(core->addr[line].pin) != 315 core->addrPortToggle) { 316 core->singleAddrPort = 0; 317 } 318 #endif 319 } 320 321 // Get pointers to bit set and clear registers (and toggle, if present) 322 core->setReg = (uint8_t *)_PM_portSetRegister(core->clockPin); 323 core->clearReg = (uint8_t *)_PM_portClearRegister(core->clockPin); 324 #if defined(_PM_portToggleRegister) 325 core->toggleReg = (uint8_t *)_PM_portToggleRegister(core->clockPin); 326 #endif 327 328 // Reset plane/row counters, config and start timer 329 _PM_resume(core); 330 331 return PROTOMATTER_OK; 332 } 333 334 // Disable (but do not deallocate) a Protomatter matrix. Disables matrix by 335 // setting OE pin HIGH and writing all-zero data to matrix shift registers, 336 // so it won't halt with lit LEDs. 337 void _PM_stop(Protomatter_core *core) { 338 if((core)) { 339 while(core->swapBuffers); // Wait for any pending buffer swap 340 _PM_timerStop(core->timer); // Halt timer 341 *core->oe.setReg = core->oe.bit; // Set OE HIGH (disable output) 342 // So, in PRINCIPLE, setting OE high would be sufficient... 343 // but in case that pin is shared with another function such 344 // as the onloard LED (which pulses during bootloading) let's 345 // also clear out the matrix shift registers for good measure. 346 // Set all RGB pins LOW... 
347 for(uint8_t i=0; i<core->parallel * 6; i++) { 348 _PM_pinLow(core->rgbPins[i]); 349 } 350 // Clock out bits (just need to toggle clock with RGBs held low) 351 for(uint32_t i=0; i<core->width; i++) { 352 _PM_pinHigh(core->clockPin); 353 _PM_clockHoldHigh; 354 _PM_pinLow(core->clockPin); 355 _PM_clockHoldLow; 356 } 357 // Latch data 358 *core->latch.setReg = core->latch.bit; 359 *core->latch.clearReg = core->latch.bit; 360 } 361 } 362 363 void _PM_resume(Protomatter_core *core) { 364 if((core)) { 365 // Init plane & row to max values so they roll over on 1st interrupt 366 core->plane = core->numPlanes - 1; 367 core->row = core->numRowPairs - 1; 368 core->prevRow = (core->numRowPairs > 1) ? (core->row - 1) : 1; 369 core->swapBuffers = 0; 370 core->frameCount = 0; 371 372 _PM_timerInit(core->timer); // Configure timer 373 _PM_timerStart(core->timer, 1000); // Start timer 374 } 375 } 376 377 // Free memory associated with core structure. Does NOT dealloc struct. 378 void _PM_free(Protomatter_core *core) { 379 if((core)) { 380 _PM_stop(core); 381 // TO DO: Set all pins back to inputs here? 382 if(core->screenData) free(core->screenData); 383 if(core->addr) free(core->addr); 384 if(core->rgbPins) { 385 free(core->rgbPins); 386 core->rgbPins = NULL; 387 } 388 } 389 } 390 391 392 // ISR function (in arch.h) calls this function which it extern'd. 
393 void _PM_row_handler(Protomatter_core *core) { 394 395 *core->oe.setReg = core->oe.bit; // Disable LED output 396 397 *core->latch.setReg = core->latch.bit; // Latch data from PRIOR pass 398 // Stop timer, save count value at stop 399 uint32_t elapsed = _PM_timerStop(core->timer); 400 uint8_t prevPlane = core->plane; // Save that plane # for later timing 401 *core->latch.clearReg = core->latch.bit; // (split to add a few cycles) 402 403 // If plane 0 just finished being displayed (plane 1 was loaded on prior 404 // pass, or there's only one plane...I know, it's confusing), take note 405 // of the elapsed timer value, for subsequent bitplane timing (each 406 // plane period is double the previous). Value is filtered slightly to 407 // avoid jitter. 408 if((prevPlane == 1) || (core->numPlanes == 1)) { 409 core->bitZeroPeriod = ((core->bitZeroPeriod * 7) + elapsed) / 8; 410 if(core->bitZeroPeriod < core->minPeriod) { 411 core->bitZeroPeriod = core->minPeriod; 412 } 413 } 414 415 if(prevPlane == 0) { // Plane 0 just finished loading 416 #if defined(_PM_portToggleRegister) 417 // If all address lines are on a single PORT (and bit toggle is 418 // available), do address line change all at once. Even doing all 419 // this math takes MUCH less time than the delays required when 420 // doing line-by-line changes. 421 if(core->singleAddrPort) { 422 // Make bitmasks of prior and new row bits 423 uint32_t priorBits = 0, newBits = 0; 424 for(uint8_t line=0,bit=1; line<core->numAddressLines; 425 line++, bit<<=1) { 426 if(core->row & bit) { 427 newBits |= core->addr[line].bit; 428 } 429 if(core->prevRow & bit) { 430 priorBits |= core->addr[line].bit; 431 } 432 } 433 *core->addrPortToggle = newBits ^ priorBits; 434 _PM_delayMicroseconds(_PM_ROW_DELAY); 435 } else { 436 #endif 437 // Configure row address lines individually, making changes 438 // (with delays) only where necessary. 
439 for(uint8_t line=0,bit=1; line<core->numAddressLines; 440 line++, bit<<=1) { 441 if((core->row & bit) != (core->prevRow & bit)) { 442 if(core->row & bit) { // Set addr line high 443 *core->addr[line].setReg = core->addr[line].bit; 444 } else { // Set addr line low 445 *core->addr[line].clearReg = core->addr[line].bit; 446 } 447 _PM_delayMicroseconds(_PM_ROW_DELAY); 448 } 449 } 450 #if defined(_PM_portToggleRegister) 451 } 452 #endif 453 core->prevRow = core->row; 454 } 455 456 // Advance bitplane index and/or row as necessary 457 if(++core->plane >= core->numPlanes) { // Next data bitplane, or 458 core->plane = 0; // roll over bitplane to start 459 if(++core->row >= core->numRowPairs) { // Next row, or 460 core->row = 0; // roll over row to start 461 // Switch matrix buffers if due (only if double-buffered) 462 if(core->swapBuffers) { 463 core->activeBuffer = 1 - core->activeBuffer; 464 core->swapBuffers = 0; // Swapped! 465 } 466 core->frameCount++; 467 } 468 } 469 470 // 'plane' now is index of data to issue, NOT data to display. 471 // 'prevPlane' is the previously-loaded data, which gets displayed 472 // now while the next plane data is loaded. 
473 474 // Set timer and enable LED output for data loaded on PRIOR pass: 475 _PM_timerStart(core->timer, core->bitZeroPeriod << prevPlane); 476 *core->oe.clearReg = core->oe.bit; // Enable LED output 477 478 uint32_t elementsPerLine = _PM_chunkSize * 479 ((core->width + (_PM_chunkSize - 1)) / _PM_chunkSize); 480 uint32_t srcOffset = elementsPerLine * 481 (core->numPlanes * core->row + core->plane) * core->bytesPerElement; 482 if(core->doubleBuffer) { 483 srcOffset += core->bufferSize * core->activeBuffer; 484 } 485 486 if(core->bytesPerElement == 1) { 487 blast_byte(core, (uint8_t *)(core->screenData + srcOffset)); 488 } else if(core->bytesPerElement == 2) { 489 blast_word(core, (uint16_t *)(core->screenData + srcOffset)); 490 } else { 491 blast_long(core, (uint32_t *)(core->screenData + srcOffset)); 492 } 493 494 // 'plane' data is now loaded, will be shown on NEXT pass 495 } 496 497 // Innermost data-stuffing loop functions 498 499 // The presence of a bit-toggle register can make the data-stuffing loop a 500 // fair bit faster (2 PORT accesses per column vs 3). But ironically, some 501 // devices (e.g. SAMD51) can outpace the matrix max CLK speed, so we slow 502 // them down with a few NOPs. These are defined in arch.h as needed. 503 // _PM_clockHoldLow is whatever code necessary to delay the clock rise 504 // after data is placed on the PORT. _PM_clockHoldHigh is code for delay 505 // before setting the clock back low. If undefined, nothing goes there. 
506 507 #if defined(_PM_portToggleRegister) 508 #define PEW \ 509 *toggle = *data++; /* Toggle in new data + toggle clock low */ \ 510 _PM_clockHoldLow; \ 511 *toggle = clock; /* Toggle clock high */ \ 512 _PM_clockHoldHigh; 513 #else 514 #define PEW \ 515 *set = *data++; /* Set RGB data high */ \ 516 _PM_clockHoldLow; \ 517 *set32 = clock; /* Set clock high */ \ 518 _PM_clockHoldHigh; \ 519 *clear32 = rgbclock; /* Clear RGB data + clock */ 520 #endif 521 522 #if _PM_chunkSize == 1 523 #define PEW_UNROLL PEW 524 #elif _PM_chunkSize == 8 525 #define PEW_UNROLL PEW PEW PEW PEW PEW PEW PEW PEW 526 #elif _PM_chunkSize == 16 527 #define PEW_UNROLL \ 528 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW 529 #elif _PM_chunkSize == 32 530 #define PEW_UNROLL \ 531 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ 532 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW 533 #elif _PM_chunkSize == 64 534 #define PEW_UNROLL \ 535 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ 536 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ 537 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW \ 538 PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW PEW 539 #else 540 #error "Unimplemented _PM_chunkSize value" 541 #endif 542 543 // There are THREE COPIES of the following function -- one each for byte, 544 // word and long. If changes are made in any one of them, the others MUST 545 // be updated to match! (Decided against using macro tricks for the 546 // function, too often ends in disaster...but must be vigilant in the 547 // three-function maintenance then.) 548 549 static void blast_byte(Protomatter_core *core, uint8_t *data) { 550 #if defined(_PM_portToggleRegister) 551 // If here, it was established in begin() that the RGB data bits and 552 // clock are all within the same byte of a PORT register, else we'd be 553 // in the word- or long-blasting functions now. 
So we just need an 554 // 8-bit pointer to the PORT. 555 volatile uint8_t *toggle = (volatile uint8_t *)core->toggleReg + 556 core->portOffset; 557 #else 558 // No-toggle version is a little different. If here, RGB data is all 559 // in one byte of PORT register, clock can be any bit in 32-bit PORT. 560 volatile uint8_t *set; // For RGB data set 561 volatile uint32_t *set32; // For clock set 562 volatile uint32_t *clear32; // For RGB data + clock clear 563 set = (volatile uint8_t *)core->setReg + portOffset; 564 set32 = (volatile uint32_t *)core->setReg; 565 clear32 = (volatile uint32_t *)core->clearReg; 566 uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit 567 #endif 568 uint32_t clock = core->clockMask; // Clock bit 569 uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize; 570 571 // PORT has already been initialized with RGB data + clock bits 572 // all LOW, so we don't need to initialize that state here. 573 574 while(chunks--) { 575 PEW_UNROLL // _PM_chunkSize RGB+clock writes 576 } 577 578 #if defined(_PM_portToggleRegister) 579 // Want the PORT left with RGB data and clock LOW on function exit 580 // (so it's easier to see on 'scope, and to prime it for the next call). 581 // This is implicit in the no-toggle case (due to how the PEW macro 582 // works), but toggle case requires explicitly clearing those bits. 583 // rgbAndClockMask is an 8-bit value when toggling, hence offset here. 584 *((volatile uint8_t *)core->clearReg + core->portOffset) = 585 core->rgbAndClockMask; 586 #endif 587 } 588 589 static void blast_word(Protomatter_core *core, uint16_t *data) { 590 #if defined(_PM_portToggleRegister) 591 // See notes above -- except now 16-bit word in PORT. 
592 volatile uint16_t *toggle = (volatile uint16_t *)core->toggleReg + 593 core->portOffset; 594 #else 595 volatile uint16_t *set; // For RGB data set 596 volatile uint32_t *set32; // For clock set 597 volatile uint32_t *clear32; // For RGB data + clock clear 598 set = (volatile uint16_t *)core->setReg + core->portOffset; 599 set32 = (volatile uint32_t *)core->setReg; 600 clear32 = (volatile uint32_t *)core->clearReg; 601 uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit 602 #endif 603 uint32_t clock = core->clockMask; // Clock bit 604 uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize; 605 while(chunks--) { 606 PEW_UNROLL // _PM_chunkSize RGB+clock writes 607 } 608 #if defined(_PM_portToggleRegister) 609 // rgbAndClockMask is a 16-bit value when toggling, hence offset here. 610 *((volatile uint16_t *)core->clearReg + core->portOffset) = 611 core->rgbAndClockMask; 612 #endif 613 } 614 615 static void blast_long(Protomatter_core *core, uint32_t *data) { 616 #if defined(_PM_portToggleRegister) 617 // See notes above -- except now full 32-bit PORT. 618 volatile uint32_t *toggle = (volatile uint32_t *)core->toggleReg; 619 #else 620 // Note in this case two copies exist of the PORT set register. 621 // The optimizer will most likely simplify this; leaving as-is, not 622 // wanting a special case of the PEW macro due to divergence risk. 
623 volatile uint32_t *set; // For RGB data set 624 volatile uint32_t *set32; // For clock set 625 volatile uint32_t *clear32; // For RGB data + clock clear 626 set = (volatile uint32_t *)core->setReg; 627 set32 = (volatile uint32_t *)core->setReg; 628 clear32 = (volatile uint32_t *)core->clearReg; 629 uint32_t rgbclock = core->rgbAndClockMask; // RGB + clock bit 630 #endif 631 uint32_t clock = core->clockMask; // Clock bit 632 uint8_t chunks = (core->width + (_PM_chunkSize - 1)) / _PM_chunkSize; 633 while(chunks--) { 634 PEW_UNROLL // _PM_chunkSize RGB+clock writes 635 } 636 #if defined(_PM_portToggleRegister) 637 *(volatile uint32_t *)core->clearReg = core->rgbAndClockMask; 638 #endif 639 } 640 641 // Returns current value of frame counter and resets its value to zero. 642 // Two calls to this, timed one second apart (or use math with other 643 // intervals), can be used to get a rough frames-per-second value for 644 // the matrix (since this is difficult to estimate beforehand). 645 uint32_t _PM_getFrameCount(Protomatter_core *core) { 646 uint32_t count = 0; 647 if((core)) { 648 count = core->frameCount; 649 core->frameCount = 0; 650 } 651 return count; 652 } 653 654 // Note to future self: I've gone back and forth between implementing all 655 // this either as it currently is (with byte, word and long cases for various 656 // steps), or using a uint32_t[64] table for expanding RGB bit combos to PORT 657 // bit combos. The latter would certainly simplify the code a ton, and the 658 // additional table lookup step wouldn't significantly impact performance, 659 // especially going forward with faster processors (the SAMD51 code already 660 // requires a few NOPs in the innermost loop to avoid outpacing the matrix). 
661 // BUT, the reason this is NOT currently done is that it only allows for a 662 // single matrix chain (doing parallel chains would require either an 663 // impractically large lookup table, or adding together multiple tables' 664 // worth of bitmasks, which would slow things down in the vital inner loop). 665 // Although parallel matrix chains aren't yet 100% implemented in this code 666 // right now, I wanted to leave that possibility for the future, as a way to 667 // handle larger matrix combos, because long chains will slow down the 668 // refresh rate.