I timed this on two machines: a roughly 3-year-old laptop with a Core 2 (not Core 2 Duo) processor, 3 GB of RAM, and Windows XP, and a desktop with a Core i7 950 at 3.2 GHz, 6 GB of RAM, and 64-bit Windows 7.
With a 1024x768 image (actually just random data) the laptop took about 28 ms to convert from BGR to RGBA; the desktop took about 6 ms.
The code I used is below. Comments are welcome.
#include <windows.h>
#include <iostream>
using namespace std;

// makeARandomByteArray() is defined elsewhere; it returns a w * h * bytesPerPixel buffer
// filled with random bytes (a sketch of such a helper is shown after the function).
unsigned char* makeARandomByteArray(int w, int h, int bytesPerPixel);

void testConversion()
{
    __int64 freqx, tStartx, tStopx;
    unsigned long microsec;

    // Get the frequency of the hi-res timer
    QueryPerformanceFrequency((LARGE_INTEGER*)&freqx);

    int w = 1024;
    int h = 768;
    int d = 3;  // bytes per pixel in the BGR source
    int da = 4; // bytes per pixel in the RGBA destination

    // this makes a 3-byte-per-pixel "image" of static
    unsigned char* old_image = makeARandomByteArray(w, h, d);

    int old_size = w * h * d;
    int new_size = w * h * da;

    //unsigned char r = 0;
    //unsigned char g = 0;
    //unsigned char b = 0;
    unsigned char a = 255;
    long counter = 0;

    // this allocation takes about 3 ms on my laptop
    unsigned char* new_image = new unsigned char[new_size];

    QueryPerformanceCounter((LARGE_INTEGER*)&tStartx);

    // this loop takes about 31 ms on my laptop
    // after dropping the assignments to r, g and b it dropped to 28 ms
    // on the i7 3.2 GHz 64-bit Win7 machine it took 6 ms
    for (int i = 0; i < old_size; i += 3)
    {
        //b = old_image[i];
        //g = old_image[i + 1];
        //r = old_image[i + 2];
        // I saved about 3 ms (around 10% of the total time) by not assigning to r, g and b above
        new_image[counter]     = old_image[i + 2]; // r
        new_image[counter + 1] = old_image[i + 1]; // g
        new_image[counter + 2] = old_image[i];     // b
        new_image[counter + 3] = a;
        counter += 4;
    }

    QueryPerformanceCounter((LARGE_INTEGER*)&tStopx);

    microsec = (unsigned long)(((tStopx - tStartx) * 1000000) / freqx);
    cout << "Conversion took " << microsec << " microseconds" << endl;

    delete[] old_image;
    delete[] new_image;
}
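
makeARandomByteArray() isn't shown above. A minimal sketch of such a helper, assuming it only needs to fill a heap buffer with pseudo-random bytes (the exact contents don't matter for the timing), might look like this:

#include <cstdlib>

// Sketch of a helper like makeARandomByteArray(): allocates a
// w * h * bytesPerPixel buffer and fills it with random bytes.
// The helper actually used for the timings above may differ.
unsigned char* makeARandomByteArray(int w, int h, int bytesPerPixel)
{
    int size = w * h * bytesPerPixel;
    unsigned char* buf = new unsigned char[size];
    for (int i = 0; i < size; ++i)
        buf[i] = (unsigned char)(rand() % 256);
    return buf;
}

The caller is responsible for freeing the returned buffer with delete[], as testConversion() does at the end.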