OK, here is the full source for a minimal example that shows the (broken) GPU and (working) CPU versions side by side. The C++ code is only about 100 lines for each implementation, and I have annotated which blocks of code belong to the CPU version and which to the GPU version. There is also a flag, cpu_enable, at the top that controls whether the CPU version runs - I have made no effort to make the CPU implementation efficient, so it is incredibly slow.
The C++ code:
#include "stdafx.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <SFML/Graphics.hpp>
int main(int argc, char *argv[])
{
unsigned int defaultSize = 160;
bool cpu_enable = true;
//Window
//GPU:
sf::RenderWindow window(sf::VideoMode(defaultSize, defaultSize), "SFML test");
//CPU:
sf::RenderWindow cpu_Window(sf::VideoMode(defaultSize, defaultSize), "CPU");
window.setFramerateLimit(60);
//Draw layer
//GPU
sf::RenderTexture drawRenderTexture;
drawRenderTexture.create(window.getSize().x, window.getSize().y);
drawRenderTexture.clear(sf::Color::Transparent);
sf::Sprite drawSprite;
drawSprite.setTexture(drawRenderTexture.getTexture());
sf::RenderTexture blurRenderTexture;
blurRenderTexture.create(drawRenderTexture.getSize().x, drawRenderTexture.getSize().y);
blurRenderTexture.clear(sf::Color::Transparent);
sf::Sprite blurSprite;
blurSprite.setTexture(blurRenderTexture.getTexture());
sf::Shader drawBlur;
if (!drawBlur.loadFromFile("drawBlur.frag", sf::Shader::Fragment)) {
return EXIT_FAILURE;
}
if (!drawBlur.isAvailable()) {
return EXIT_FAILURE;
}
drawBlur.setUniform("texture", sf::Shader::CurrentTexture);
drawBlur.setUniform("xResolution", float(1.0f / float(drawRenderTexture.getSize().x)));
drawBlur.setUniform("yResolution", float(1.0f / float(drawRenderTexture.getSize().y)));
//CPU
sf::Image cpu_drawImage;
cpu_drawImage.create(cpu_Window.getSize().x, cpu_Window.getSize().y, sf::Color::Transparent);
sf::Image cpu_blurImage;
cpu_blurImage.create(cpu_drawImage.getSize().x, cpu_drawImage.getSize().y, sf::Color::Transparent);
sf::Texture cpu_texture;
cpu_texture.create(cpu_blurImage.getSize().x, cpu_blurImage.getSize().y);
sf::Sprite cpu_drawSprite;
cpu_drawSprite.setTexture(cpu_texture);
//Main loop:
int iFrame = 0;
while (window.isOpen())
{
//Input:
sf::Event event;
while (window.pollEvent(event))
{
if (event.type == sf::Event::Closed) {
window.close();
}
if (event.type == sf::Event::Resized) {
window.setView(sf::View(sf::FloatRect(0.f, 0.f, window.getSize().x, window.getSize().y)));
}
}
//Logic:
//Draw circling trail
double radius = 0.7 * double(window.getSize().x) / 2.0;
double prec = 1.0;
double angle = double(iFrame % int(360 * prec)) / prec;
double degreesToRadians = std::acos(-1) / 180.0;
sf::Vertex drawVertices[1000];
int nVertices = 0;
for (int iX = 0; iX < 10; ++iX) {
for (int iY = 0; iY < 10; ++iY) {
sf::Color drawColour = sf::Color::White;
float xLoc = window.getSize().x / 2 + radius * cos(angle * degreesToRadians) + iX;
float yLoc = window.getSize().y / 2 + radius * sin(angle * degreesToRadians) + iY;
//GPU:
drawVertices[nVertices++] = sf::Vertex(sf::Vector2f(xLoc, yLoc), drawColour);
//CPU:
cpu_drawImage.setPixel(unsigned int(xLoc), unsigned int(yLoc), drawColour);
}
}
//GPU:
drawRenderTexture.draw(drawVertices, nVertices, sf::PrimitiveType::Points);
drawRenderTexture.display();
blurRenderTexture.clear(sf::Color::Transparent);//Don't need to clear, as when using sf::BlendNone the original is entirely overdrawn
blurRenderTexture.draw(drawSprite, sf::RenderStates(sf::BlendNone, sf::Transform(), NULL, &drawBlur));
blurRenderTexture.display();
//Cycle texture back to image:
drawRenderTexture.clear(sf::Color::Transparent);//Don't need to clear, as when using sf::BlendNone the original is entirely overdrawn
drawRenderTexture.draw(blurSprite, sf::RenderStates(sf::BlendNone));
//TODO: Why doesn't it dissipate? Seems to get to a certain distance and minimal density, then neither get more spread out, nor get less dense...??
//CPU:
//Draw blurred version of the image to the blur texture:
if (cpu_enable) {
for (unsigned int x = 0; x < cpu_drawImage.getSize().x; ++x) {
for (unsigned int y = 0; y < cpu_drawImage.getSize().y; ++y) {
float weightCentre = 1.0;//TODO: Could it be an issue of float precision when numbers are summed together and then divided at the end?
float weightAdj = 0.5;
float weightDiag = 0.25;
float weights[3] = {weightCentre, weightAdj, weightDiag};
float alphaWeight = 0.0;
float colourWeight = 0.0;
float colour[4] = {0.0, 0.0, 0.0, 0.0};
for (int dx = -1; dx <= 1; ++dx) {
for (int dy = -1; dy <= 1; ++dy) {
unsigned int tX = x + dx;
unsigned int tY = y + dy;
sf::Color inColour(0, 0, 0, 0);
if (tX < 0 || tX >= cpu_drawImage.getSize().x || tY < 0 || tY >= cpu_drawImage.getSize().y) {
//Out of bounds defaults to transparent black
} else {
inColour = cpu_drawImage.getPixel(tX, tY);
}
float texColour[4] = {float(inColour.r) / float(255.0), float(inColour.g) / float(255.0), float(inColour.b) / float(255.0), float(inColour.a) / float(255.0)};
int ds = dx*dx + dy*dy;//This works nicely because numbers have magnitude 0 or 1, but won't stretch further
float weight = weights[ds];
float effectiveWeight = texColour[3] * weight;
colour[3] += effectiveWeight;
alphaWeight += weight;
colour[0] += texColour[0] * effectiveWeight;
colour[1] += texColour[1] * effectiveWeight;
colour[2] += texColour[2] * effectiveWeight;
colourWeight += effectiveWeight;
}
}
//Make certain we never divide by zero
colour[3] = alphaWeight > 0.0 ? colour[3] / alphaWeight : 0.0;
colour[0] = colourWeight > 0.0 ? colour[0] / colourWeight : colour[0];
colour[1] = colourWeight > 0.0 ? colour[1] / colourWeight : colour[1];
colour[2] = colourWeight > 0.0 ? colour[2] / colourWeight : colour[2];
sf::Color outColour(sf::Uint8(colour[0] * 255), sf::Uint8(colour[1] * 255), sf::Uint8(colour[2] * 255), sf::Uint8(colour[3] * 255));
//Apply to blur image:
cpu_blurImage.setPixel(x, y, outColour);
}
}
//copy the blurred version back to the original image:
//(could probably get double the performance if we alternated which image we drew to/from rather than doing a copy, but performance doesn't matter here: only correctness)
for (unsigned int x = 0; x < cpu_drawImage.getSize().x; ++x) {
for (unsigned int y = 0; y < cpu_drawImage.getSize().y; ++y) {
cpu_drawImage.setPixel(x, y, cpu_blurImage.getPixel(x, y));
}
}
//Push blurred image to renderable texture:
cpu_texture.update(cpu_blurImage);
//Render the cpu blurred image:
cpu_Window.clear();
cpu_Window.draw(cpu_drawSprite);
cpu_Window.display();
} else {
//Dont render Render the cpu blurred image:
cpu_Window.clear(sf::Color::Red);
cpu_Window.display();
}
//Rendering:
window.clear();
window.draw(blurSprite);
window.display();
if (iFrame % 60 == 0) {
printf("%d\n", iFrame);
}
++iFrame;
}
return EXIT_SUCCESS;
}
And the shader (drawBlur.frag):
// drawBlur.frag - 3x3 alpha-weighted blur of the texture being drawn.
uniform sampler2D texture;   // source texture (the host binds sf::Shader::CurrentTexture)
uniform float xResolution;   // width of one texel in texture coordinates (1 / texture width)
uniform float yResolution;   // height of one texel in texture coordinates (1 / texture height)

void main()
{
    vec2 offx = vec2(xResolution, 0.0);
    vec2 offy = vec2(0.0, yResolution);

    // Kernel weights indexed by squared offset from the centre: [0] centre, [1] adjacent, [2] diagonal.
    // Assigned element by element: a C-style "{...}" initialiser is not accepted by GLSL compilers
    // that follow the specification for shaders without a #version directive.
    //TODO: Could it be an issue of float precision when numbers are summed together and then divided at the end?
    float weights[3];
    weights[0] = 1.0;
    weights[1] = 0.5;
    weights[2] = 0.25;

    //Alpha weighted colour blending: (maybe there is some built in way to achieve what I'm doing here trivially)
    float alphaWeight = 0.0;
    float colourWeight = 0.0;
    vec4 colour = vec4(0.0, 0.0, 0.0, 0.0);
    for (int dx = -1; dx <= 1; ++dx) {
        for (int dy = -1; dy <= 1; ++dy) {
            vec4 texColour = texture2D(texture, gl_TexCoord[0].xy + float(dx) * offx + float(dy) * offy);
            int ds = dx * dx + dy * dy;//This works nicely because numbers have magnitude 0 or 1, but won't stretch further
            float weight = weights[ds];
            // Colour contributions are scaled by alpha so transparent neighbours do not tint the result.
            float effectiveWeight = texColour.a * weight;
            colour.a += effectiveWeight;
            alphaWeight += weight;
            colour.rgb += texColour.rgb * effectiveWeight;
            colourWeight += effectiveWeight;
        }
    }

    //Make certain we never divide by zero
    colour.a = alphaWeight > 0.0 ? colour.a / alphaWeight : 0.0;
    colour.rgb = colourWeight > 0.0 ? colour.rgb / colourWeight : colour.rgb;

    // Truncate alpha to a whole 8-bit step, as the CPU reference does when it converts back to
    // sf::Uint8. If the store rounds to nearest instead, a faint plateau (e.g. alpha 1/255)
    // reproduces itself every pass and the trail never fully fades.
    // NOTE(review): assumes an 8-bit-per-channel render target - confirm.
    colour.a = floor(colour.a * 255.0) / 255.0;

    gl_FragColor = colour;
}