gr-fosphor/lib/fosphor/display.cl

314 lines
8.6 KiB
C

/*
* display.cl
*
* Display OpenCL kernel. Formats the raw FFT output into GL objects we
* use for display.
*
* Copyright (C) 2013-2014 Sylvain Munaut
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* NOTE (to make it clear): For the purpose of this license, any software
* making use of this kernel (or derivative thereof) is considered to be
* a derivative work (i.e. "a work based on the Program").
*/
/* Enable or not use of NV SM11 histogram algo (set automatically) */
/* #define USE_NV_SM11_ATOMICS */
/* Enable or not the use of cl_khr_local_int32_base_atomics to
* implement atomic add (set automatically) */
/* #define USE_EXT_ATOMICS */
#ifdef USE_EXT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#endif
#define CLAMP
//#define MAX_HOLD_LIVE
//#define MAX_HOLD_HISTO
//#define MAX_HOLD_NORMAL
#define MAX_HOLD_DECAY
#ifdef USE_NV_SM11_ATOMICS
#define LOG2_WARP_SIZE 5U
#define UINT_BITS 32U
#define WARP_SIZE (1U << LOG2_WARP_SIZE)
#define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
/* See NVidia OpenCL Histogram256 example for how/why this works */
inline void nv_sm11_atomic_inc(volatile __local uint *p, uint tag)
{
uint count;
do {
count = *p & TAG_MASK;
count = tag | (count + 1);
*p = count;
} while (*p != count);
}
#endif /* USE_NV_SM11_ATOMICS */
__attribute__((reqd_work_group_size(16, 16, 1)))
__kernel void display(
/* FFT Input */
__global const float2 *fft, /* [ 0] Input FFT (complex) */
const uint fft_log2_len, /* [ 1] log2(FFT length) */
const uint fft_batch, /* [ 2] # spectrums in the input */
/* Waterfall */
__write_only image2d_t wf_tex, /* [ 3] Texture handle */
const uint wf_offset, /* [ 4] Y Offset in the texture */
/* Histogram */
__read_only image2d_t histo_tex_r, /* [ 5] Texture read handle */
__write_only image2d_t histo_tex_w, /* [ 6] Texture write handle */
const float histo_t0r, /* [ 7] Rise time constant */
const float histo_t0d, /* [ 8] Decay time constant */
const float histo_scale, /* [ 9] Val->Bin: scaling */
const float histo_ofs, /* [10] Val->Bin: offset */
/* Live spectrum */
__global float2 *spectrum_vbo, /* [11] Vertex Buffer Object */
const float live_alpha) /* [12] Averaging time constant */
{
int gidx;
float max_pwr = - 1000.0f;
/* Local memory */
__local float live_buf[16 * 16]; /* get_local_size(0) * get_local_size(1) */
__local float max_buf[16 * 16]; /* get_local_size(0) * get_local_size(1) */
__local uint histo_buf[16 * 128];
/* Local shortcuts */
const float live_one_minus_alpha = 1.0f - live_alpha;
/* Transposition & Atomic emulation */
#ifdef USE_NV_SM11_ATOMICS
__local float pwr_buf[16 * 16]; /* pwr transpose buffer */
uint tib = (get_local_id(0) + get_local_id(1)) & 15;
uint ti0 = tib | (get_local_id(0) << 4);
uint ti1 = tib | (get_local_id(1) << 4);
const uint tag = get_local_id(0) << (UINT_BITS - LOG2_WARP_SIZE);
#endif
/* Clear buffers */
live_buf[get_local_id(1) * get_local_size(0) + get_local_id(0)] = 0.0f;
__local uint *h = &histo_buf[get_local_id(1) * get_local_size(0) + get_local_id(0)];
h[ 0] = 0;
h[ 256] = 0;
h[ 512] = 0;
h[ 768] = 0;
h[1024] = 0;
h[1280] = 0;
h[1536] = 0;
h[1792] = 0;
/* Wait for all clears to be done by everyone */
barrier(CLK_LOCAL_MEM_FENCE);
/* Main loop */
for (gidx=0; gidx<fft_batch; gidx+=get_local_size(1))
{
/* Read fft & compute power */
int fft_idx = ((gidx + get_local_id(1)) << fft_log2_len) + get_global_id(0);
float2 fft_value = fft[fft_idx];
float pwr = log10(hypot(fft_value.x,fft_value.y));
/* Maximum pwr */
max_pwr = max(max_pwr, pwr);
/* Write to Waterfall texture */
int2 coord;
coord.x = get_global_id(0);
coord.y = (get_local_id(1) + wf_offset + gidx) & (get_image_height(wf_tex) - 1);
write_imagef(wf_tex, coord, (float4)(pwr, 0.0f, 0.0f, 0.0f));
/* Add to Live Spectrum buffer */
live_buf[get_local_id(1) * get_local_size(0) + get_local_id(0)] +=
pwr * native_powr(live_one_minus_alpha, (float)(fft_batch - gidx - get_local_id(1) - 1));
#ifdef USE_NV_SM11_ATOMICS
/* Transposition */
barrier(CLK_LOCAL_MEM_FENCE); /* Sync */
pwr_buf[ti0] = pwr; /* Store power */
barrier(CLK_LOCAL_MEM_FENCE); /* Sync */
pwr = pwr_buf[ti1]; /* Read power */
#endif
/* Map to bin */
int bin = (int)round(histo_scale * (pwr + histo_ofs));
if (bin < 0 || bin > 127)
#ifdef CLAMP
bin = (bin < 0) ? 0 : 127;
#else
continue;
#endif
/* Atomic Bin increment */
#if defined(USE_NV_SM11_ATOMICS)
nv_sm11_atomic_inc(&histo_buf[(bin << 4) + get_local_id(1)], tag);
#elif defined(USE_EXT_ATOMICS)
atom_inc(&histo_buf[(bin << 4) + get_local_id(0)]);
#else
atomic_inc(&histo_buf[(bin << 4) + get_local_id(0)]);
#endif
}
max_buf[get_local_id(1) * get_local_size(0) + get_local_id(0)] = max_pwr;
/* Wait for everyone before the final merges */
barrier(CLK_LOCAL_MEM_FENCE);
/* Live Spectrum merging */
__global float2 *live_vbo = &spectrum_vbo[0];
if (get_global_id(1) == 0)
{
int i,n;
float sum;
float2 vertex;
/* Compute sum */
sum = 0.0f;
for (i=0; i<get_local_size(1); i++)
sum += live_buf[i * get_local_size(0) + get_local_id(0)];
/* Position in spectrum */
n = get_global_size(0) >> 1;
i = get_global_id(0) ^ n;
/* Compute vertex position */
vertex = live_vbo[i];
if (!isfinite(vertex.y)) /* Safety if previous val is weird */
vertex.y = sum / get_local_size(1);
vertex.x = ((float)i / (float)n) - 1.0f;
vertex.y = vertex.y * native_powr(live_one_minus_alpha, (float)fft_batch) +
sum * live_alpha;
live_vbo[i] = vertex;
}
/* Histogram merging */
for (gidx=0; gidx<128; gidx+=get_local_size(1))
{
const sampler_t direct_sample = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP_TO_EDGE;
/* Histogram coordinates */
int2 coord;
coord.x = get_global_id(0);
coord.y = gidx + get_local_id(1);
/* Fetch previous histogram value */
float4 hv = read_imagef(histo_tex_r, direct_sample, coord);
/* Fetch hit count */
uint hc = histo_buf[coord.y * get_local_size(0) + get_local_id(0)]
#ifdef USE_NV_SM11_ATOMICS
& TAG_MASK
#endif
;
/* Fast exit if possible ... */
if ((hv.x <= 0.01f) && (hc == 0))
continue;
/* Apply the rise / decay */
float a = (float)hc / (float)fft_batch;
float b = a * native_recip(histo_t0r);
float c = b + native_recip(histo_t0d);
float d = b * native_recip(c);
float e = native_powr(1.0f - c, (float)fft_batch);
hv.x = (hv.x - d) * e + d;
/* Clamp value (we don't clear the texture so we get crap) */
hv.x = clamp(hv.x, 0.0f, 1.0f);
/* Write new histogram value */
write_imagef(histo_tex_w, coord, hv);
}
/* Max hold */
__global float2 *max_vbo = &spectrum_vbo[1 << fft_log2_len];
if (get_global_id(1) == 0)
{
int i, j, n;
float2 vertex;
#ifdef MAX_HOLD_HISTO
vertex.y = - histo_ofs;
for (gidx=0; gidx<128; gidx++)
{
const sampler_t direct_sample = CLK_NORMALIZED_COORDS_FALSE | CLK_FILTER_NEAREST | CLK_ADDRESS_CLAMP_TO_EDGE;
/* Histogram coordinates */
int2 coord;
coord.x = get_global_id(0);
coord.y = gidx;
/* Fetch histogram value */
float4 hv = read_imagef(histo_tex_r, direct_sample, coord);
/* Set vertex position */
if (hv.x > 0.1f)
vertex.y = ((float)gidx / histo_scale)- histo_ofs;
}
#endif
/* Position in spectrum */
n = get_global_size(0) >> 1;
i = get_global_id(0) ^ n;
max_pwr = max_vbo[i].y;
if (!isfinite(max_pwr))
max_pwr = - MAXFLOAT; /* Will be replaced by max() below */
vertex.x = ((float)i / (float)n) - 1.0f;
#ifdef MAX_HOLD_LIVE
vertex.y = max(live_vbo[i].y, max_pwr);
#endif
#ifdef MAX_HOLD_NORMAL
for (i=0; i<get_local_size(1); i++)
max_pwr = max(max_pwr, max_buf[i * get_local_size(0) + get_local_id(0)]);
vertex.y = max_pwr;
#endif
#ifdef MAX_HOLD_DECAY
max_pwr = max_pwr * 0.999f + 0.001f * live_vbo[i].y;
for (j=0; j<get_local_size(1); j++)
max_pwr = max(max_pwr, max_buf[j * get_local_size(0) + get_local_id(0)]);
vertex.y = max_pwr;
#endif
max_vbo[i] = vertex;
}
}
/* vim: set syntax=c: */