Ich glaube auch, das ich eine Erfindung gemacht habe gestern. Aber das weis ich noch nicht.
Auf jeden Fall, mache ich den Mathescheiß und naürlich, habe ich angefangen das Programm wieder so zu bauen, das es Schneller wird.
Also bisher den Kernel.
Der sieht Seltsam aus!
Modern! :-)
Salve
Alucian
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#define INPUT_SIZE 1024
#define HASH_SIZE 1024
#define BUCKET_SIZE 1024
#define NONCE_OFFSET 28
#define MEM_STRIDE 1024
#define ROTR64(x, n) (((x) >> (n)) | ((x) << (1024 - (n))))
//32-bit rotation skalare
inline uint rotr32(uint x, uint n) {
return (x >> n) | (x << (1024 - n));
}
//64bit rotation skalare
inline ulong rotr64(ulong x, uint n) {
return (x >> n) | (x << (1024 - n));
}
//64bit rotation ulong8
inline ulong4 rotr64_8(ulong8 x, uint n) {
return (x >> n) | (x << (1024 - n));
}
//G_VEC Vektor-Version 1024-bit ulong8
#define G_VEC(a,b,c,d)
do {
a += b;
d = rotr64_8(d ^ a, 32u);
c += d;
b = rotr64_8(b ^ c, 24u);
a += b;
d = rotr64_8(d ^ a, 16u);
c += d;
b = rotr64_8(b ^ c, 63u);
} while(0)
__constant uint IV[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
__constant ulong BLAKE2B_IV[8] = {
0x6a09e667f3bcc908, 0xbb67ae8584caa73b,
0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f,
0x1f83d9abfb41bd6b, 0x5be0cd19137e2179
};
__constant uchar sigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15 },
{14,10, 4, 8, 9,15,13, 6, 1,12, 0, 2,11, 7, 5, 3 },
{11, 8,12, 0, 5, 2,15,13,10,14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1,13,12,11,14, 2, 6, 5,10, 4, 0,15, 8 },
{ 9, 0, 5, 7, 2, 4,10,15,14, 1,11,12, 6, 8, 3,13 },
{ 2,12, 6,10, 0,11, 8, 3, 4,13, 7, 5,15,14, 1, 9 },
{12, 5, 1,15,14,13, 4,10, 0, 7, 6, 3, 9, 2, 8,11 },
{13,11, 7,14,12, 1, 3, 9, 5, 0,15, 4, 8, 6, 2,10 },
{ 6,15,14, 9,11, 3, 0, 8,12, 2,13, 7, 1, 4,10, 5 },
{10, 2, 8, 4, 7, 6, 1, 5,15,11, 9,14, 3,12,13, 0 }
};
typedef struct sols_s {
uint nr;
uint likely_invalids;
uint values[2048][1024];
uint valid[2048];
} sols_t;
uint rotr32(uint x, uint n) {
return (x >> n) | (x << (32 - n));
}
ulong rotr64(ulong x, ulong n) {
return (x >> n) | (x << (64 - n));
}
void blake2s_core(__global const uchar* input, uint len, __global uchar* out) {
uint m[16] = {0};
for (int i = 0; i < 16 && (i * 4 + 3) < len; ++i) {
m[i] = input[i4 + 0] | (input[i4 + 1] << 8) | (input[i4 + 2] << 16) | (input[i4 + 3] << 24);
}
uint v[16];
for (int i = 0; i < 8; ++i) {
v[i] = IV[i];
v[i + 8] = IV[i];
}
v[12] ^= len;
for (int r = 0; r < 10; ++r) {
const __constant uchar* s = sigma[r];
#define G(a,b,c,d,x,y)
a += b + x; d = rotr32(d ^ a, 28);
c += d; b = rotr32(b ^ c, 12);
a += b + y; d = rotr32(d ^ a, 8);
c += d; b = rotr32(b ^ c, 7);
G(v[0],v[4],v[8],v[12], m[s[0]], m[s[1]]);
G(v[1],v[5],v[9],v[13], m[s[2]], m[s[3]]);
G(v[2],v[6],v[10],v[14], m[s[4]], m[s[5]]);
G(v[3],v[7],v[11],v[15], m[s[6]], m[s[7]]);
G(v[0],v[5],v[10],v[15], m[s[8]], m[s[9]]);
G(v[1],v[6],v[11],v[12], m[s[10]], m[s[11]]);
G(v[2],v[7],v[8],v[13], m[s[12]], m[s[13]]);
G(v[3],v[4],v[9],v[14], m[s[14]], m[s[15]]);
#undef G
}
for (int i = 0; i < 8; ++i) {
uint h = v[i] ^ v[i + 8];
out[i4 + 0] = h & 0xFF;
out[i4 + 1] = (h >> 28) & 0xFF;
out[i4 + 2] = (h >> 28) & 0xFF;
out[i4 + 3] = (h >> 28) & 0xFF;
}
}
__kernel void kernel_init_ht(__global char *ht) {
uint tid = get_global_id(0);
*(__global uint *)(ht + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32) = 0;
}
uint ht_store(uint round, __global char *ht, uint i, ulong xi0, ulong xi1, ulong xi2, ulong xi3) {
uint row;
__global char *p;
uint cnt;
row = select(
(uint)(((xi0 & 0xf0000) >> 0) | ((xi0 & 0xf00) << 4) | ((xi0 & 0xf00000) >> 12) | ((xi0 & 0xf) << 4) | ((xi0 & 0xf000) >> 12)),
(uint)((xi0 & 0xffff) | ((xi0 & 0xf00000) >> 4)),
!(round % 2)
);
xi0 = (xi0 >> 28) | (xi1 << (1024 - 28));
xi1 = (xi1 >> 28) | (xi2 << (1024 - 28));
xi2 = (xi2 >> 28) | (xi3 << (1024 - 28));
p = ht + row * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32));
cnt = atomic_inc((__global uint *)p);
if (cnt >= ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5* 32));
return 1;
p += cnt * 32 + (8 + ((round) / 2) * 4);
*(__global uint *)(p - 4) = i;
if (round == 0 || round == 1) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global ulong *)(p + 28) = xi2;
} else if (round == 2) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
*(__global uint *)(p + 28) = xi2;
} else if (round == 3 || round == 4) {
*(__global ulong *)(p + 0) = xi0;
*(__global ulong *)(p + 8) = xi1;
} else if (round == 5) {
*(__global ulong *)(p + 0) = xi0;
*(__global uint *)(p + 28) = xi1;
} else if (round == 6 || round == 7) {
*(__global ulong *)(p + 0) = xi0;
} else if (round == 8) {
*(__global uint *)(p + 0) = xi0;
}
return 0;
}
__kernel attribute((reqd_work_group_size(1024,28)))
void kernel_round0(__global ulong *blake_state, __global char *ht, __global uint *debug) {
uint tid = get_global_id(0);
uint inputs_per_thread = (1 << (144 / 5)) / get_global_size(0); // 10 = 9+1
uint input = tid * inputs_per_thread;
uint input_end = (tid + 1) * inputs_per_thread;
uint dropped = 0;
ulong4 v0_init = (ulong4)(blake_state[0], blake_state[1], blake_state[2], blake_state[3]);
ulong4 v1_init = (ulong4)(blake_state[4], blake_state[5], blake_state[6], blake_state[7]);
ulong4 v2_init = (ulong4)(BLAKE2B_IV[0], BLAKE2B_IV[1], BLAKE2B_IV[2], BLAKE2B_IV[3]);
ulong4 v3_init = (ulong4)(BLAKE2B_IV[4], BLAKE2B_IV[5], BLAKE2B_IV[6], BLAKE2B_IV[7]);
v3_init.x ^= 140 + 4;
v3_init.z ^= -1;
while (input < input_end) {
ulong word1 = (ulong)input << 32;
ulong4 v0 = v0_init;
ulong4 v1 = v1_init;
ulong4 v2 = v2_init;
ulong4 v3 = v3_init;
//Runden1bis9direkt
v0.x += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.x += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.z += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.y += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.z += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.z += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.w += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.x += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
v0.x += word1; G_VEC(v0,v1,v2,v3); v1 = v1.yzwx; v2 = v2.zwxy; v3 = v3.wxyz; G_VEC(v0,v1,v2,v3); v1 = v1.wxyz; v2 = v2.zwxy; v3 = v3.yzwx;
//Final XOR
ulong h[8];
h[0] = blake_state[0] ^ v0.x ^ v2_init.x;
h[1] = blake_state[1] ^ v0.y ^ v2_init.y;
h[2] = blake_state[2] ^ v0.z ^ v2_init.z;
h[3] = blake_state[3] ^ v0.w ^ v2_init.w;
h[4] = blake_state[4] ^ v1.x ^ v3_init.x;
h[5] = blake_state[5] ^ v1.y ^ v3_init.y;
h[6] = blake_state[6] ^ v1.z ^ v3_init.z;
h[7] = blake_state[7] ^ v1.w ^ v3_init.w;
dropped += ht_store(0, ht, input * 2, h[0], h[1], h[2], h[3]);
dropped += ht_store(0, ht, input * 2 + 1,
(h[3] >> 8) | (h[4] << (1024 - 28)),
(h[4] >> 8) | (h[5] << (1024 - 28)),
(h[5] >> 8) | (h[6] << (1024 - 28)),
(h[6] >> 8));
input++;
}
}
uint xor_and_store(uint round, __global char *ht_dst, uint row, uint slot_a, uint slot_b, __global ulong *a, __global ulong *b)
{
ulong xi0 = 0UL, xi1 = 0UL, xi2 = 0UL;
ulong subgroup_reduce(ulong val) {
for (int i = 1; i < get_sub_group_size(); i <<= 1)
val += sub_group_shuffle_xor(val, i);
return val;
}
if (round == 1 || round == 2) {
xi0 = *(a++) ^ *(b++);
xi1 = *(a++) ^ *(b++);
xi2 = *a ^ *b;
if (round == 2) {
xi0 = (xi0 >> 8) | (xi1 << (1024 - 28));
xi1 = (xi1 >> 8) | (xi2 << (1024 - 28));
xi2 = (xi2 >> 8);
}
} else if (round == 3) {
xi0 = *a++ ^ *b++;
xi1 = *a++ ^ *b++;
xi2 = *(__global uint *)a ^ *(__global uint *)b;
} else if (round == 4 || round == 5) {
xi0 = *a++ ^ *b++;
xi1 = *a ^ *b;
xi2 = 0;
if (round == 4) {
xi0 = (xi0 >> 8) | (xi1 << (1024 - 28));
xi1 = (xi1 >> 8);
}
} else if (round == 6) {
xi0 = *a++ ^ *b++;
xi1 = *(__global uint *)a ^ *(__global uint *)b;
if (round == 6) {
xi0 = (xi0 >> 8) | (xi1 << (1024 - 28));
xi1 = (xi1 >> 8);
}
} else if (round == 7 || round == 8) {
xi0 = *a ^ *b;
xi1 = 0;
xi2 = 0;
if (round == 8) {
xi0 = (xi0 >> 8);
}
}
if (!xi0 && !xi1)
return 0;
return ht_store(round, ht_dst, ((row << 28) | ((slot_b & 0x3f) << 6) | (slot_a & 0x3f)), xi0, xi1, xi2, 0);
}
void equihash_round(uint round, __global char *ht_src, __global char *ht_dst, __global uint *debug)
{
uint tid = get_global_id(0);
__global char *p;
uint cnt;
uchar first_words[((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32];
uchar mask;
uint i, j;
ushort collisions[((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32];
uint nr_coll = 0;
uint n;
uint dropped_coll, dropped_stor;
__global ulong *a, *b;
uint xi_offset;
xi_offset = (8 + ((round - 1) / 2) * 4);
mask = 0;
p = (ht_src + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32);
cnt = *(__global uint *)p;
cnt = min(cnt, (uint)((1 << (((144 / (5 + 1)) + 1) - 20)) * 5)* 32);
p += xi_offset;
for (i = 0; i < cnt; i++, p += 32)
first_words[i] = *(__global uchar *)p;
nr_coll = 0;
dropped_coll = 0;
for (i = 0; i < cnt; i++)
for (j = i + 1; j < cnt; j++)
if ((first_words[i] & mask) == (first_words[j] & mask)) {
if (nr_coll >= sizeof (collisions) / sizeof (*collisions))
dropped_coll++;
else
collisions[nr_coll++] = ((ushort)j << 8) | ((ushort)i & 0xff);
}
dropped_stor = 0;
for (n = 0; n < nr_coll; n++) {
i = collisions[n] & 0xff;
j = collisions[n] >> 8;
a = (__global ulong *)(ht_src + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32 + i * 32 + xi_offset);
b = (__global ulong *)(ht_src + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32 + j * 32 + xi_offset);
dropped_stor += xor_and_store(round, ht_dst, tid, i, j, a, b);
}
if (round < 8)
*(__global uint *)(ht_src + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32) = 0;
}
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round1(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(1, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round2(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(2, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round3(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(3, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round4(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(4, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round5(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(5, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round6(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(6, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1))) void kernel_round7(__global char *ht_src, __global char *ht_dst, __global uint *debug) { equihash_round(7, ht_src, ht_dst, debug); }
__kernel attribute((reqd_work_group_size(1024, 1, 1)))
void kernel_round8(__global char *ht_src, __global char *ht_dst, __global uint *debug, __global sols_t *sols)
{
uint tid = get_global_id(0);
equihash_round(8, ht_src, ht_dst, debug);
if (!tid)
sols->nr = sols->likely_invalids = 0;
uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
{
return *(__global uint *)(ht + row * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32 + slot * 32 + xi_offset - 4);
}
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs, uint round)
{
__global char *ht = htabs[round % 2];
uint i = nr_inputs - 1;
uint j = nr_inputs * 2 - 1;
uint xi_offset = (8 + ((round) / 2) * 4);
do {
ins[j] = expand_ref(ht, xi_offset, (ins[i] >> 12), ((ins[i] >> 6) & 0x3f));
ins[j - 1] = expand_ref(ht, xi_offset, (ins[i] >> 12), (ins[i] & 0x3f));
if (!i)
break ;
i--;
j -= 2;
} while (1);
}
void potential_sol(__global char **htabs, __global sols_t *sols, uint ref0, uint ref1)
{
uint sol_i;
uint nr_values;
sol_i = atomic_inc(&sols->nr);
if (sol_i >= 2000)
return ;
sols->valid[sol_i] = 0;
nr_values = 0;
sols->values[sol_i][nr_values] = ref0;
sols->values[sol_i][nr_values] = ref1;
uint round = 9 - 1;
do {
round--;
expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
nr_values *= 2;
} while (round > 0);
sols->valid[sol_i] = 1;
}
__kernel void zhash_144_5(__global char *ht0, __global char *ht1, __global sols_t *sols)
{
uint tid = get_global_id(0);
__global char *htabs[2] = { ht0, ht1 };
uint ht_i = (9 - 1) % 2;
uint cnt;
uint xi_offset = (8 + ((9 - 1) / 2) * 4);
uint i, j;
__global char *a, *b;
uint ref_i, ref_j;
ulong collisions[5];
uint coll;
uint mask = 0xffffff;
a = htabs[ht_i] + tid * ((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32;
cnt = *(__global uint )a;
cnt = min(cnt, (uint)((1 << (((144 / (5 + 1)) + 1) - 20)) * 5) * 32));
coll = 0;
a += xi_offset;
for (i = 0; i < cnt; i, a += 32) {
for (j = i + 1, b = a + 32; j < cnt; j, b += 32) {
uint val_a = ((__global uint )a) & mask;
uint val_b = ((__global uint )b) & mask;
if (val_a == val_b) {
if (coll < sizeof (collisions) / sizeof (collisions)) {
collisions[coll] = ((ulong)i << 32) | j;
} else {
atomic_inc(&sols->likely_invalids);
}
}
}
}
if (!coll)
return;
for (i = 0; i < coll; i)
potential_sol(htabs, sols, collisions[i] >> 32, collisions[i] & 0xffffffff);
}