You seem to be doing everything RIGHT! I have no idea why this wouldn't work for you.
I organise my gpu_threads_conf like this, but I really don't think it should matter. For double threads (two entries per GPU index), it would look like this:
"gpu_threads_conf" : [
{ "index" : 0, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 0, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 1, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 1, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 2, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 2, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 3, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true},
{ "index" : 3, "intensity" : 432, "worksize" : 8, "affine_to_cpu" : false, "strided_index" : 1, "mem_chunk" : 2, "comp_mode" : true}
],