Skip to content

Commit f62619a

Browse files
committed
updated add and scale color functions to not use references
- using references is slightly faster but using function calls by value is safer - this is in preparation to optimize RAM usage on ESP32 for larger matrix setups - optimized by dropping the scale check, it is faster overall - FPS is now more consistent and on average about the same as it was
1 parent 770723e commit f62619a

File tree

2 files changed

+57
-63
lines changed

2 files changed

+57
-63
lines changed

wled00/FXparticleSystem.cpp

Lines changed: 53 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@
1717
// local shared functions (used both in 1D and 2D system)
1818
static int32_t calcForce_dv(const int8_t force, uint8_t &counter);
1919
static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius
20-
static void fast_color_add(CRGBW &c1, const CRGBW &c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
21-
static void fast_color_scale(CRGBW &c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255
20+
static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
21+
static uint32_t fast_color_scale(CRGBW c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255
2222
#endif
2323

2424
#ifndef WLED_DISABLE_PARTICLESYSTEM2D
@@ -572,7 +572,7 @@ void ParticleSystem2D::render() {
572572
for (int32_t y = 0; y <= maxYpixel; y++) {
573573
int index = y * (maxXpixel + 1);
574574
for (int32_t x = 0; x <= maxXpixel; x++) {
575-
fast_color_scale(framebuffer[index], motionBlur); // note: could skip if only globalsmear is active but usually they are both active and scaling is fast enough
575+
framebuffer[index] = fast_color_scale(framebuffer[index], motionBlur); // note: could skip if only globalsmear is active but usually they are both active and scaling is fast enough
576576
index++;
577577
}
578578
}
@@ -634,7 +634,8 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
634634
uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT;
635635
uint32_t y = particles[particleindex].y >> PS_P_RADIUS_SHIFT;
636636
if (x <= (uint32_t)maxXpixel && y <= (uint32_t)maxYpixel) {
637-
fast_color_add(framebuffer[x + (maxYpixel - y) * (maxXpixel + 1)], color, brightness);
637+
uint32_t index = x + (maxYpixel - y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
638+
framebuffer[index] = fast_color_add(framebuffer[index], color, brightness);
638639
}
639640
return;
640641
}
@@ -682,14 +683,14 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
682683
}
683684

684685
if (advPartProps && advPartProps[particleindex].size > 1) { //render particle to a bigger size
685-
CRGBW renderbuffer[100]; // 10x10 pixel buffer
686+
uint32_t renderbuffer[100]; // 10x10 pixel buffer
686687
memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
687688
//particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10
688689
//first, render the pixel to the center of the renderbuffer, then apply 2D blurring
689-
fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
690-
fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
691-
fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
692-
fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
690+
renderbuffer[4 + (4 * 10)] = fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
691+
renderbuffer[5 + (4 * 10)] = fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
692+
renderbuffer[5 + (5 * 10)] = fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
693+
renderbuffer[4 + (5 * 10)] = fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
693694
uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented in the loop below to start with 4
694695
uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
695696
uint32_t maxsize = advPartProps[particleindex].size;
@@ -746,7 +747,8 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
746747
else
747748
continue;
748749
}
749-
fast_color_add(framebuffer[xfb + (maxYpixel - yfb) * (maxXpixel + 1)], renderbuffer[xrb + yrb * 10]);
750+
uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
751+
framebuffer[idx] = fast_color_add(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
750752
}
751753
}
752754
} else { // standard rendering (2x2 pixels)
@@ -781,8 +783,10 @@ __attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint
781783
}
782784
}
783785
for (uint32_t i = 0; i < 4; i++) {
784-
if (pixelvalid[i])
785-
fast_color_add(framebuffer[pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1)], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
786+
if (pixelvalid[i]) {
787+
uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
788+
framebuffer[idx] = fast_color_add(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
789+
}
786790
}
787791
}
788792
}
@@ -983,7 +987,7 @@ void ParticleSystem2D::updatePSpointers(bool isadvanced, bool sizecontrol) {
983987
particles = reinterpret_cast<PSparticle *>(this + 1); // pointer to particles
984988
particleFlags = reinterpret_cast<PSparticleFlags *>(particles + numParticles); // pointer to particle flags
985989
sources = reinterpret_cast<PSsource *>(particleFlags + numParticles); // pointer to source(s) at data+sizeof(ParticleSystem2D)
986-
framebuffer = reinterpret_cast<CRGBW *>(SEGMENT.getPixels()); // pointer to framebuffer
990+
framebuffer = SEGMENT.getPixels(); // pointer to framebuffer
987991
PSdataEnd = reinterpret_cast<uint8_t *>(sources + numSources); // pointer to first available byte after the PS for FX additional data (already aligned to 4 byte boundary)
988992
if (isadvanced) {
989993
advPartProps = reinterpret_cast<PSadvancedParticle *>(PSdataEnd);
@@ -1007,7 +1011,7 @@ void ParticleSystem2D::updatePSpointers(bool isadvanced, bool sizecontrol) {
10071011
// for speed, 1D array and 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
10081012
// to blur a subset of the buffer, change the xsize/ysize and set xstart/ystart to the desired starting coordinates (default start is 0/0)
10091013
// subset blurring only works on 10x10 buffer (single particle rendering), if other sizes are needed, buffer width must be passed as parameter
1010-
void blur2D(CRGBW *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur, uint32_t yblur, uint32_t xstart, uint32_t ystart, bool isparticle) {
1014+
void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur, uint32_t yblur, uint32_t xstart, uint32_t ystart, bool isparticle) {
10111015
CRGBW seeppart, carryover;
10121016
uint32_t seep = xblur >> 1;
10131017
uint32_t width = xsize; // width of the buffer, used to calculate the index of the pixel
@@ -1022,12 +1026,11 @@ void blur2D(CRGBW *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur,
10221026
carryover = BLACK;
10231027
uint32_t indexXY = xstart + y * width;
10241028
for (uint32_t x = xstart; x < xstart + xsize; x++) {
1025-
seeppart = colorbuffer[indexXY]; // create copy of current color
1026-
fast_color_scale(seeppart, seep); // scale it and seep to neighbours
1029+
seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
10271030
if (x > 0) {
1028-
fast_color_add(colorbuffer[indexXY - 1], seeppart);
1029-
if (carryover) // note: check adds overhead but is faster on average
1030-
fast_color_add(colorbuffer[indexXY], carryover);
1031+
colorbuffer[indexXY - 1] = fast_color_add(colorbuffer[indexXY - 1], seeppart);
1032+
if (carryover.color32) // note: check adds overhead but is faster on average
1033+
colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
10311034
}
10321035
carryover = seeppart;
10331036
indexXY++; // next pixel in x direction
@@ -1044,12 +1047,11 @@ void blur2D(CRGBW *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblur,
10441047
carryover = BLACK;
10451048
uint32_t indexXY = x + ystart * width;
10461049
for (uint32_t y = ystart; y < ystart + ysize; y++) {
1047-
seeppart = colorbuffer[indexXY]; // create copy of current color
1048-
fast_color_scale(seeppart, seep); // scale it and seep to neighbours
1050+
seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
10491051
if (y > 0) {
1050-
fast_color_add(colorbuffer[indexXY - width], seeppart);
1051-
if (carryover) // note: check adds overhead but is faster on average
1052-
fast_color_add(colorbuffer[indexXY], carryover);
1052+
colorbuffer[indexXY - width] = fast_color_add(colorbuffer[indexXY - width], seeppart);
1053+
if (carryover.color32) // note: check adds overhead but is faster on average
1054+
colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
10531055
}
10541056
carryover = seeppart;
10551057
indexXY += width; // next pixel in y direction
@@ -1432,7 +1434,7 @@ void ParticleSystem1D::render() {
14321434

14331435
if (motionBlur) { // blurring active
14341436
for (int32_t x = 0; x <= maxXpixel; x++) {
1435-
fast_color_scale(framebuffer[x], motionBlur);
1437+
framebuffer[x] = fast_color_scale(framebuffer[x], motionBlur);
14361438
}
14371439
}
14381440
else { // no blurring: clear buffer
@@ -1468,7 +1470,7 @@ void ParticleSystem1D::render() {
14681470
CRGBW bg_color = SEGCOLOR(1);
14691471
if (bg_color > 0) { //if not black
14701472
for (int32_t i = 0; i <= maxXpixel; i++) {
1471-
fast_color_add(framebuffer[i], bg_color);
1473+
framebuffer[i] = fast_color_add(framebuffer[i], bg_color);
14721474
}
14731475
}
14741476
#ifndef WLED_DISABLE_2D
@@ -1491,7 +1493,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
14911493
if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
14921494
uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
14931495
if (x <= (uint32_t)maxXpixel) { //by making x unsigned there is no need to check < 0 as it will overflow
1494-
fast_color_add(framebuffer[x], color, brightness);
1496+
framebuffer[x] = fast_color_add(framebuffer[x], color, brightness);
14951497
}
14961498
return;
14971499
}
@@ -1523,13 +1525,13 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
15231525
}
15241526
// check if particle has advanced size properties and buffer is available
15251527
if (advPartProps && advPartProps[particleindex].size > 1) {
1526-
CRGBW renderbuffer[10]; // 10 pixel buffer
1528+
uint32_t renderbuffer[10]; // 10 pixel buffer
15271529
memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
15281530
//render particle to a bigger size
15291531
//particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
15301532
//first, render the pixel to the center of the renderbuffer, then apply 1D blurring
1531-
fast_color_add(renderbuffer[4], color, pxlbrightness[0]);
1532-
fast_color_add(renderbuffer[5], color, pxlbrightness[1]);
1533+
renderbuffer[4] = fast_color_add(renderbuffer[4], color, pxlbrightness[0]);
1534+
renderbuffer[5] = fast_color_add(renderbuffer[5], color, pxlbrightness[1]);
15331535
uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented in the loop below to start with 4
15341536
uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
15351537
uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
@@ -1563,7 +1565,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
15631565
#ifdef ESP8266 // no local buffer on ESP8266
15641566
SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
15651567
#else
1566-
fast_color_add(framebuffer[xfb], renderbuffer[xrb]);
1568+
framebuffer[xfb] = fast_color_add(framebuffer[xfb], renderbuffer[xrb]);
15671569
#endif
15681570
}
15691571
}
@@ -1583,7 +1585,7 @@ __attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint
15831585
}
15841586
for (uint32_t i = 0; i < 2; i++) {
15851587
if (pxlisinframe[i]) {
1586-
fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]);
1588+
framebuffer[pixco[i]] = fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]);
15871589
}
15881590
}
15891591
}
@@ -1737,12 +1739,12 @@ void ParticleSystem1D::updatePSpointers(bool isadvanced) {
17371739
PSdataEnd = reinterpret_cast<uint8_t *>(sources + numSources); // pointer to first available byte after the PS for FX additional data (already aligned to 4 byte boundary)
17381740
#ifndef WLED_DISABLE_2D
17391741
if(SEGMENT.is2D() && SEGMENT.map1D2D) {
1740-
framebuffer = reinterpret_cast<CRGBW *>(sources + numSources); // use local framebuffer for 1D->2D mapping
1742+
framebuffer = reinterpret_cast<uint32_t *>(sources + numSources); // use local framebuffer for 1D->2D mapping
17411743
PSdataEnd = reinterpret_cast<uint8_t *>(framebuffer + SEGMENT.maxMappingLength()); // pointer to first available byte after the PS for FX additional data (still aligned to 4 byte boundary)
17421744
}
17431745
else
17441746
#endif
1745-
framebuffer = reinterpret_cast<CRGBW *>(SEGMENT.getPixels()); // use segment buffer for standard 1D rendering
1747+
framebuffer = SEGMENT.getPixels(); // use segment buffer for standard 1D rendering
17461748

17471749
if (isadvanced) {
17481750
advPartProps = reinterpret_cast<PSadvancedParticle1D *>(PSdataEnd);
@@ -1792,7 +1794,7 @@ bool allocateParticleSystemMemory1D(const uint32_t numparticles, const uint32_t
17921794
requiredmemory += sizeof(PSsource1D) * numsources;
17931795
#ifndef WLED_DISABLE_2D
17941796
if(SEGMENT.is2D())
1795-
requiredmemory += sizeof(CRGBW) * SEGMENT.maxMappingLength(); // need local buffer for mapped rendering. CRGBW is 32bit, so this is a multiple of 4 bytes
1797+
requiredmemory += sizeof(uint32_t) * SEGMENT.maxMappingLength(); // need local buffer for mapped rendering
17961798
#endif
17971799
requiredmemory += additionalbytes;
17981800
if (isadvanced)
@@ -1827,18 +1829,17 @@ bool initParticleSystem1D(ParticleSystem1D *&PartSys, const uint32_t requestedso
18271829
// blur a 1D buffer, sub-size blurring can be done using start and size
18281830
// for speed, 32bit variables are used, make sure to limit them to 8bit (0-255) or result is undefined
18291831
// to blur a subset of the buffer, change the size and set start to the desired starting coordinates
1830-
void blur1D(CRGBW *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
1832+
void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
18311833
{
18321834
CRGBW seeppart, carryover;
18331835
uint32_t seep = blur >> 1;
18341836
carryover = BLACK;
18351837
for (uint32_t x = start; x < start + size; x++) {
1836-
seeppart = colorbuffer[x]; // create copy of current color
1837-
fast_color_scale(seeppart, seep); // scale it and seep to neighbours
1838+
seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
18381839
if (x > 0) {
1839-
fast_color_add(colorbuffer[x-1], seeppart);
1840-
if (carryover) // note: check adds overhead but is faster on average
1841-
fast_color_add(colorbuffer[x], carryover); // is black on first pass
1840+
colorbuffer[x-1] = fast_color_add(colorbuffer[x-1], seeppart);
1841+
if (carryover.color32) // note: check adds overhead but is faster on average
1842+
colorbuffer[x] = fast_color_add(colorbuffer[x], carryover); // is black on first pass
18421843
}
18431844
carryover = seeppart;
18441845
}
@@ -1887,23 +1888,14 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
18871888
return true; // particle is in bounds
18881889
}
18891890

1890-
// fastled color adding is very inaccurate in color preservation (but it is fast)
1891-
// a better color add function is implemented in colors.cpp but it uses 32bit RGBW. to use it colors need to be shifted just to then be shifted back by that function, which is slow
1892-
// this is a fast version for RGB (no white channel, PS does not handle white) and with native CRGBW including scaling of second color
1893-
// note: result is stored in c1, not using a return value is faster as the CRGBW struct does not need to be copied upon return
1894-
// note2: function is mainly used to add scaled colors, so checking if one color is black is slower
1895-
// note3: scale is 255 when using blur, checking for that makes blur faster
1896-
__attribute__((optimize("O2"))) static void fast_color_add(CRGBW &c1, const CRGBW &c2, const uint8_t scale) {
1891+
// this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color
1892+
// note: function is mainly used to add scaled colors, so checking if one color is black is slower
1893+
// note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer
1894+
__attribute__((optimize("O2"))) static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
18971895
uint32_t r, g, b;
1898-
if (scale < 255) {
1899-
r = c1.r + ((c2.r * scale) >> 8);
1900-
g = c1.g + ((c2.g * scale) >> 8);
1901-
b = c1.b + ((c2.b * scale) >> 8);
1902-
} else {
1903-
r = c1.r + c2.r;
1904-
g = c1.g + c2.g;
1905-
b = c1.b + c2.b;
1906-
}
1896+
r = c1.r + ((c2.r * scale) >> 8);
1897+
g = c1.g + ((c2.g * scale) >> 8);
1898+
b = c1.b + ((c2.b * scale) >> 8);
19071899

19081900
// note: this chained comparison is the fastest method for max of 3 values (faster than std::max() or using xor)
19091901
uint32_t max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? g : b);
@@ -1917,13 +1909,15 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
19171909
c1.g = (g * newscale) >> 16;
19181910
c1.b = (b * newscale) >> 16;
19191911
}
1912+
return c1.color32;
19201913
}
19211914

1922-
// faster than fastled color scaling as it does in place scaling
1923-
__attribute__((optimize("O2"))) static void fast_color_scale(CRGBW &c, const uint8_t scale) {
1915+
// fast CRGBW color scaling ignoring white channel (PS does not handle white)
1916+
__attribute__((optimize("O2"))) static uint32_t fast_color_scale(CRGBW c, const uint8_t scale) {
19241917
c.r = ((c.r * scale) >> 8);
19251918
c.g = ((c.g * scale) >> 8);
19261919
c.b = ((c.b * scale) >> 8);
1920+
return c.color32;
19271921
}
19281922

19291923
#endif // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D))

0 commit comments

Comments
 (0)