#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
// arrPrecalc_post_l27: table of precomputed 64-bit constants in CONSTANT memory.
// Spot checks (first, second and last listed entries) show each value equal to
// the same-index entry of arrPrecalc_post_r3 rotated left by 24 bits. That matches
// its use: on the OpenCL path LUT3_r3/LUT7_r3 read this table directly where the
// host path computes ROL64(<r3 entry>, 24) (LUT7 additionally swaps the halves).
// NOTE(review): the array is declared with 4*256 elements, but the initializer
// appears to list only 256 values (the remainder would be zero-initialized) --
// confirm whether the extra size is intentional.
static const CONSTANT UINT64 arrPrecalc_post_l27[4*256] =
{
0x00C300C0C6C183C6ULL, 0x291C611919323578ULL, 0xF631FE3635C48FCBULL, 0x9F443F4747DE6B78ULL, 0x6439343C3E589D0AULL, 0x4DC6D5C5C08B6B15ULL, 0x4008200808481028ULL, 0x12790A7A786CF372ULL, 0x69B6C1B1B4DB6775ULL, 0xCD35153537FA8822ULL, 0xF6937E969065CDEEULL, 0xDFAF9FAFA877B837ULL, 0x7BCFCBCBCCB79407ULL, 0xFB7D0B7B7986F672ULL, 0xE48BF48C8B69FF7FULL, 0x5292AA9297C5203DULL, 0x3B04EB03023E07E9ULL, 0x4DE655E5E1AB2BB4ULL, 0x64DAB4DCD9B95E6DULL, 0x2470147474500C60ULL, 0x8D1DB51D1E92D8ABULL, 0x006180606360C1E3ULL, 0xFBDF8BDBDC27B457ULL, 0xA9AEA1A9AC03570DULL, 0x40EBA0E8EFA9D34FULL, 0x9F053F07059EEA3AULL, 0xB6BBDEBEB90D9D67ULL, 0xF6117E1614E4CF6AULL, 0x6975C171721AE4B3ULL, 0x12598A5A594CB3D3ULL, 0x1FF6FFF7F2EF090DULL, 0x12BA0ABABEAD70B4ULL, 0x40AAA0A8ADE9520DULL, 0xFBBE0BBBBF4775B4ULL, 0x29BEE1B9BC93775DULL, 0xDF2D9F2F2CF6BAB3ULL, 0x64FA34FCF8991ECCULL, 0x9F873F87811FE8BEULL, 0x5251AA525104A3FBULL, 0xF6D27ED6D2254CACULL, 0xD2C3EAC2C515812FULL, 0x36481E4E4E7C7E50ULL, 0xA94D21494BE2946AULL, 0x8051405052D0A112ULL, 0x0D8FF58D8A83FA7FULL, 0x4D05D505064AE8D3ULL, 0xFB5D8B5B58A6B6D3ULL, 0xE429742C2EC8BD5AULL, 0x0DEE75EDE9E33B9CULL, 0x92EB4AEAEC7DD1A6ULL, 0x0082008084810284ULL, 0x9FA7BFA7A03FA81FULL, 0xB6585E5E5EEC5E00ULL, 0x69F7C1F1F69BE637ULL, 0x4028A02829685089ULL, 0xFB3C0B3B3BC67730ULL, 0x9F25BF2724BEAA9BULL, 0x293CE139381275D9ULL, 0x9208CA0A0B9C12C1ULL, 0x6458B45C5D385CE9ULL, 0x8D3D353D3FB2980AULL, 0x7BEF4BEBED97D4A6ULL, 0xE4AB74ACAA49BFDEULL, 0x76C23EC6C2B56CFCULL, 0x5FDE5FDFDB875984ULL, 0x1F74FF77766E0B89ULL, 0x3BE76BE3E5DFC48EULL, 0xBB342B33338E6718ULL, 0x36EA9EEEEBDD3C75ULL, 0xC0BAE0B8BD79725DULL, 0x12380A3A3A2C7230ULL, 0x24F214F4F0D10EE4ULL, 0xF6507E5656A44E28ULL, 0xA96DA1696AC2D4CBULL, 0x8DFE35FDF9731BCCULL, 0xC038E03839F870D9ULL, 0x0D6C756D6D623918ULL, 0x52D3AAD2D585A17FULL, 0x6419B41C1F78DDABULL, 0x299E61999DB337FCULL, 0xFB1C8B1B1AE63791ULL, 0x8010401010902050ULL, 0xCD549555549A49C1ULL, 0x7B8ECB8B8EF71545ULL, 0x76403E4646346E78ULL, 0x40CB20C8CE8993EEULL, 
0x9249CA4A49DC9383ULL, 0x36CA1ECECAFD7CD4ULL, 0x1F977F97918FC8EEULL, 0x5F1D5F1F1D46DA42ULL, 0x12DB8ADADDCDB157ULL, 0xA440D44445E06C91ULL, 0x24D294D4D1F14E45ULL, 0x6934C131305A65F1ULL, 0x6996419195FB27D4ULL, 0x4D87D58582CBEA57ULL, 0xDF4C1F4F4F967B50ULL, 0xC079E0787BB8F19BULL, 0x36AB9EAEA99DBD37ULL, 0xA401D40407A0EDD3ULL, 0xCDF615F5F13B0BE4ULL, 0x36689E6E6F5C3EF1ULL, 0xE9A681A1A44B4725ULL, 0xD241EA42419483ABULL, 0x5FFEDFFFFAA71925ULL, 0xBBD7ABD3D46FA47FULL, 0xA483D4848321EF57ULL, 0x12FB0AFAFCEDF1F6ULL, 0xE904010101EA0500ULL, 0x3B45EB43407E86ABULL, 0x80D340D0D651A396ULL, 0xCD74157575BA0960ULL, 0x4DA755A5A3EBAAF6ULL, 0xD2A26AA2A67540CCULL, 0x649BB49C9BF9DF2FULL, 0x69144111117A2550ULL, 0x3B246B23231E4748ULL, 0xDF8F1F8F8957F896ULL, 0xFB9E8B9B9E673515ULL, 0x80924090941122D4ULL, 0xD200EA0203D402E9ULL, 0x0041004042408142ULL, 0xB6195E1E1CACDF42ULL, 0x9F64BF6766FE2BD9ULL, 0xB6DA5EDEDA6D5C84ULL, 0x0D0DF50D0E02F8FBULL, 0xE468746C6C883C18ULL, 0xA9EFA1E9EE43D64FULL, 0x64BB34BCBAD99F8EULL, 0x0000000000000000ULL, 0xB678DE7E7FCC1EA1ULL, 0x295D61595B72B43AULL, 0xBBB62BB3B70F659CULL, 0x241194141730CD83ULL, 0xF6B3FEB6B1458D4FULL, 0xC0DB60D8DE19B3BEULL, 0x0DAF75ADABA3BADEULL, 0x8D7C357D7DF21948ULL, 0xBB55AB5350EEA6FBULL, 0xD282EA828755006DULL, 0x92284A2A2ABC5260ULL, 0x5F9F5F9F99C7D8C6ULL, 0xE9860181856B0784ULL, 0x5F7CDF7F7E261BA1ULL, 0x29FFE1F9FED3F61FULL, 0x92AA4AAAAE3D50E4ULL, 0xCD15951516DAC883ULL, 0x1F547F57574E4B28ULL, 0x7B2C4B2B2B565760ULL, 0xCDD695D5D01B4B45ULL, 0x297DE1797A52F49BULL, 0x76013E060474EF3AULL, 0xF6F2FEF6F3050C0DULL, 0x00E380E0E7E1C367ULL, 0xDFEE9FEFEA373975ULL, 0x92694A6A68FCD322ULL, 0x2493949493B1CF07ULL, 0x7BAE4BABAFD755E4ULL, 0x8030C03031B060F1ULL, 0x2450945455704CC1ULL, 0xCD979595925BCA07ULL, 0x1F35FF37342E8ACBULL, 0x8071C07073F0E1B3ULL, 0xC0FBE0F8FF39F31FULL, 0xBB14AB1312AE27B9ULL, 0x76A3BEA6A1D5AD1FULL, 0x4D44D545440A6991ULL, 0x24B314B4B2918FA6ULL, 0xDFCE1FCFCB1779D4ULL, 0x36299E2E2D1CBFB3ULL, 0xA92CA12928825589ULL, 0x92CBCACACD5D9107ULL, 
0xA42154242680AD72ULL, 0xBB96AB93962F253DULL, 0xA9CF21C9CF6396EEULL, 0xD2616A6260B4C30AULL, 0x52F32AF2F4A5E1DEULL, 0x3BC7EBC3C4FF842FULL, 0xE9C701C1C72B86C6ULL, 0xA460546464C02C30ULL, 0x368B1E8E88BDFD96ULL, 0x0D2D752D2F22B85AULL, 0x1F157F17150ECA6AULL, 0x7B0CCB0B0A7617C1ULL, 0x8D9FB59D9A13DA2FULL, 0xA90C210909A21528ULL, 0xA4E254E4E0412EB4ULL, 0x80F3C0F0F771E337ULL, 0x12188A1A1B0C3291ULL, 0xB639DE3E3D8C9FE3ULL, 0x9FE6BFE7E27F295DULL, 0x00208020212040A1ULL, 0x928ACA8A8F1D1045ULL, 0xE4CAF4CCC9297E3DULL, 0x7B6D4B6B6916D622ULL, 0x4069A0686B28D1CBULL, 0x1FD67FD7D3CF49ACULL, 0xB6FADEFEFB4D1C25ULL, 0xBBF72BF3F54FE4DEULL, 0xE924812120CA45A1ULL, 0x29DF61D9DFF3B6BEULL, 0x8D5CB55D5CD259E9ULL, 0xF670FE7677840E89ULL, 0x408A20888CC912ACULL, 0x6478347C7C181C48ULL, 0x52712A727024E35AULL, 0x8DBF35BDBB339A8EULL, 0x5F5C5F5F5F065B00ULL, 0xE9E781E1E60BC667ULL, 0xE409F40C0FE8FDFBULL, 0xA4A354A4A201AFF6ULL, 0x5FBFDFBFB8E79867ULL, 0x0DCEF5CDC8C37B3DULL, 0xC09A60989C5932FCULL, 0xE9658161628AC4E3ULL, 0xB69B5E9E982DDDC6ULL, 0x5F3DDF3F3C669AE3ULL, 0xBB752B7371CEE65AULL, 0x7621BE262554AF9BULL, 0xC018601818D83078ULL, 0x52B22AB2B6E5609CULL, 0xD2206A2222F44248ULL, 0xFBFF0BFBFD07F4F6ULL, 0x0D4CF54D4C4279B9ULL, 0x69554151533AA412ULL, 0x8DDEB5DDD8535B6DULL, 0x36091E0E0C3CFF12ULL, 0x129A8A9A9F8D3015ULL, 0x76E2BEE6E3952C5DULL, 0xC05960585A98B13AULL, 0xE4EA74ECE8093E9CULL, 0x3B656B63615EC60AULL, 0xA98E21898D2317ACULL, 0x3BA66BA3A79F45CCULL, 0x1FB7FFB7B0AF884FULL, 0x52302A3232646218ULL, 0x4D645565652A2930ULL, 0xE448F44C4DA87CB9ULL, 0x00A280A0A5A14225ULL, 0xDF0D1F0F0DD6FA12ULL, 0x80B2C0B0B5316275ULL, 0x69D741D1D7BBA696ULL, 0x7B4DCB4B48369683ULL, 0x404920484A08916AULL, 0x3B86EB8386BF056DULL, 0xCDB715B5B37B8AA6ULL, 0x76833E8680F5EDBEULL, 0xDF6C9F6F6EB63BF1ULL, 0x7660BE6667142ED9ULL, 0x5210AA12134422B9ULL, 0xA4C2D4C4C1616E15ULL, 0x4D255525276AA872ULL, 0xE945014143AA8442ULL, 0xD2E36AE2E435C18EULL, 0x9FC63FC7C35F69FCULL, 0x2431143436108D22ULL,
};
// arrPrecalc_post_r3: table of precomputed 64-bit constants in CONSTANT memory.
// The kernel below copies entries 0..255 of this table into the LOCAL array
// TAll_local_r3, and the LUTn_r3 macros index that copy by byte offset.
// NOTE(review): the array is declared with 4*256 elements, but the initializer
// appears to list only 256 values (the remainder would be zero-initialized) --
// confirm whether the extra size is intentional.
static const CONSTANT UINT64 arrPrecalc_post_r3[4*256] =
{
0xC183C600C300C0C6ULL, 0x323578291C611919ULL, 0xC48FCBF631FE3635ULL, 0xDE6B789F443F4747ULL, 0x589D0A6439343C3EULL, 0x8B6B154DC6D5C5C0ULL, 0x4810284008200808ULL, 0x6CF37212790A7A78ULL, 0xDB677569B6C1B1B4ULL, 0xFA8822CD35153537ULL, 0x65CDEEF6937E9690ULL, 0x77B837DFAF9FAFA8ULL, 0xB794077BCFCBCBCCULL, 0x86F672FB7D0B7B79ULL, 0x69FF7FE48BF48C8BULL, 0xC5203D5292AA9297ULL, 0x3E07E93B04EB0302ULL, 0xAB2BB44DE655E5E1ULL, 0xB95E6D64DAB4DCD9ULL, 0x500C602470147474ULL, 0x92D8AB8D1DB51D1EULL, 0x60C1E30061806063ULL, 0x27B457FBDF8BDBDCULL, 0x03570DA9AEA1A9ACULL, 0xA9D34F40EBA0E8EFULL, 0x9EEA3A9F053F0705ULL, 0x0D9D67B6BBDEBEB9ULL, 0xE4CF6AF6117E1614ULL, 0x1AE4B36975C17172ULL, 0x4CB3D312598A5A59ULL, 0xEF090D1FF6FFF7F2ULL, 0xAD70B412BA0ABABEULL, 0xE9520D40AAA0A8ADULL, 0x4775B4FBBE0BBBBFULL, 0x93775D29BEE1B9BCULL, 0xF6BAB3DF2D9F2F2CULL, 0x991ECC64FA34FCF8ULL, 0x1FE8BE9F873F8781ULL, 0x04A3FB5251AA5251ULL, 0x254CACF6D27ED6D2ULL, 0x15812FD2C3EAC2C5ULL, 0x7C7E5036481E4E4EULL, 0xE2946AA94D21494BULL, 0xD0A1128051405052ULL, 0x83FA7F0D8FF58D8AULL, 0x4AE8D34D05D50506ULL, 0xA6B6D3FB5D8B5B58ULL, 0xC8BD5AE429742C2EULL, 0xE33B9C0DEE75EDE9ULL, 0x7DD1A692EB4AEAECULL, 0x8102840082008084ULL, 0x3FA81F9FA7BFA7A0ULL, 0xEC5E00B6585E5E5EULL, 0x9BE63769F7C1F1F6ULL, 0x6850894028A02829ULL, 0xC67730FB3C0B3B3BULL, 0xBEAA9B9F25BF2724ULL, 0x1275D9293CE13938ULL, 0x9C12C19208CA0A0BULL, 0x385CE96458B45C5DULL, 0xB2980A8D3D353D3FULL, 0x97D4A67BEF4BEBEDULL, 0x49BFDEE4AB74ACAAULL, 0xB56CFC76C23EC6C2ULL, 0x8759845FDE5FDFDBULL, 0x6E0B891F74FF7776ULL, 0xDFC48E3BE76BE3E5ULL, 0x8E6718BB342B3333ULL, 0xDD3C7536EA9EEEEBULL, 0x79725DC0BAE0B8BDULL, 0x2C723012380A3A3AULL, 0xD10EE424F214F4F0ULL, 0xA44E28F6507E5656ULL, 0xC2D4CBA96DA1696AULL, 0x731BCC8DFE35FDF9ULL, 0xF870D9C038E03839ULL, 0x6239180D6C756D6DULL, 0x85A17F52D3AAD2D5ULL, 0x78DDAB6419B41C1FULL, 0xB337FC299E61999DULL, 0xE63791FB1C8B1B1AULL, 0x9020508010401010ULL, 0x9A49C1CD54955554ULL, 0xF715457B8ECB8B8EULL, 0x346E7876403E4646ULL, 0x8993EE40CB20C8CEULL, 
0xDC93839249CA4A49ULL, 0xFD7CD436CA1ECECAULL, 0x8FC8EE1F977F9791ULL, 0x46DA425F1D5F1F1DULL, 0xCDB15712DB8ADADDULL, 0xE06C91A440D44445ULL, 0xF14E4524D294D4D1ULL, 0x5A65F16934C13130ULL, 0xFB27D46996419195ULL, 0xCBEA574D87D58582ULL, 0x967B50DF4C1F4F4FULL, 0xB8F19BC079E0787BULL, 0x9DBD3736AB9EAEA9ULL, 0xA0EDD3A401D40407ULL, 0x3B0BE4CDF615F5F1ULL, 0x5C3EF136689E6E6FULL, 0x4B4725E9A681A1A4ULL, 0x9483ABD241EA4241ULL, 0xA719255FFEDFFFFAULL, 0x6FA47FBBD7ABD3D4ULL, 0x21EF57A483D48483ULL, 0xEDF1F612FB0AFAFCULL, 0xEA0500E904010101ULL, 0x7E86AB3B45EB4340ULL, 0x51A39680D340D0D6ULL, 0xBA0960CD74157575ULL, 0xEBAAF64DA755A5A3ULL, 0x7540CCD2A26AA2A6ULL, 0xF9DF2F649BB49C9BULL, 0x7A25506914411111ULL, 0x1E47483B246B2323ULL, 0x57F896DF8F1F8F89ULL, 0x673515FB9E8B9B9EULL, 0x1122D48092409094ULL, 0xD402E9D200EA0203ULL, 0x4081420041004042ULL, 0xACDF42B6195E1E1CULL, 0xFE2BD99F64BF6766ULL, 0x6D5C84B6DA5EDEDAULL, 0x02F8FB0D0DF50D0EULL, 0x883C18E468746C6CULL, 0x43D64FA9EFA1E9EEULL, 0xD99F8E64BB34BCBAULL, 0x0000000000000000ULL, 0xCC1EA1B678DE7E7FULL, 0x72B43A295D61595BULL, 0x0F659CBBB62BB3B7ULL, 0x30CD832411941417ULL, 0x458D4FF6B3FEB6B1ULL, 0x19B3BEC0DB60D8DEULL, 0xA3BADE0DAF75ADABULL, 0xF219488D7C357D7DULL, 0xEEA6FBBB55AB5350ULL, 0x55006DD282EA8287ULL, 0xBC526092284A2A2AULL, 0xC7D8C65F9F5F9F99ULL, 0x6B0784E986018185ULL, 0x261BA15F7CDF7F7EULL, 0xD3F61F29FFE1F9FEULL, 0x3D50E492AA4AAAAEULL, 0xDAC883CD15951516ULL, 0x4E4B281F547F5757ULL, 0x5657607B2C4B2B2BULL, 0x1B4B45CDD695D5D0ULL, 0x52F49B297DE1797AULL, 0x74EF3A76013E0604ULL, 0x050C0DF6F2FEF6F3ULL, 0xE1C36700E380E0E7ULL, 0x373975DFEE9FEFEAULL, 0xFCD32292694A6A68ULL, 0xB1CF072493949493ULL, 0xD755E47BAE4BABAFULL, 0xB060F18030C03031ULL, 0x704CC12450945455ULL, 0x5BCA07CD97959592ULL, 0x2E8ACB1F35FF3734ULL, 0xF0E1B38071C07073ULL, 0x39F31FC0FBE0F8FFULL, 0xAE27B9BB14AB1312ULL, 0xD5AD1F76A3BEA6A1ULL, 0x0A69914D44D54544ULL, 0x918FA624B314B4B2ULL, 0x1779D4DFCE1FCFCBULL, 0x1CBFB336299E2E2DULL, 0x825589A92CA12928ULL, 0x5D910792CBCACACDULL, 
0x80AD72A421542426ULL, 0x2F253DBB96AB9396ULL, 0x6396EEA9CF21C9CFULL, 0xB4C30AD2616A6260ULL, 0xA5E1DE52F32AF2F4ULL, 0xFF842F3BC7EBC3C4ULL, 0x2B86C6E9C701C1C7ULL, 0xC02C30A460546464ULL, 0xBDFD96368B1E8E88ULL, 0x22B85A0D2D752D2FULL, 0x0ECA6A1F157F1715ULL, 0x7617C17B0CCB0B0AULL, 0x13DA2F8D9FB59D9AULL, 0xA21528A90C210909ULL, 0x412EB4A4E254E4E0ULL, 0x71E33780F3C0F0F7ULL, 0x0C329112188A1A1BULL, 0x8C9FE3B639DE3E3DULL, 0x7F295D9FE6BFE7E2ULL, 0x2040A10020802021ULL, 0x1D1045928ACA8A8FULL, 0x297E3DE4CAF4CCC9ULL, 0x16D6227B6D4B6B69ULL, 0x28D1CB4069A0686BULL, 0xCF49AC1FD67FD7D3ULL, 0x4D1C25B6FADEFEFBULL, 0x4FE4DEBBF72BF3F5ULL, 0xCA45A1E924812120ULL, 0xF3B6BE29DF61D9DFULL, 0xD259E98D5CB55D5CULL, 0x840E89F670FE7677ULL, 0xC912AC408A20888CULL, 0x181C486478347C7CULL, 0x24E35A52712A7270ULL, 0x339A8E8DBF35BDBBULL, 0x065B005F5C5F5F5FULL, 0x0BC667E9E781E1E6ULL, 0xE8FDFBE409F40C0FULL, 0x01AFF6A4A354A4A2ULL, 0xE798675FBFDFBFB8ULL, 0xC37B3D0DCEF5CDC8ULL, 0x5932FCC09A60989CULL, 0x8AC4E3E965816162ULL, 0x2DDDC6B69B5E9E98ULL, 0x669AE35F3DDF3F3CULL, 0xCEE65ABB752B7371ULL, 0x54AF9B7621BE2625ULL, 0xD83078C018601818ULL, 0xE5609C52B22AB2B6ULL, 0xF44248D2206A2222ULL, 0x07F4F6FBFF0BFBFDULL, 0x4279B90D4CF54D4CULL, 0x3AA4126955415153ULL, 0x535B6D8DDEB5DDD8ULL, 0x3CFF1236091E0E0CULL, 0x8D3015129A8A9A9FULL, 0x952C5D76E2BEE6E3ULL, 0x98B13AC05960585AULL, 0x093E9CE4EA74ECE8ULL, 0x5EC60A3B656B6361ULL, 0x2317ACA98E21898DULL, 0x9F45CC3BA66BA3A7ULL, 0xAF884F1FB7FFB7B0ULL, 0x64621852302A3232ULL, 0x2A29304D64556565ULL, 0xA87CB9E448F44C4DULL, 0xA1422500A280A0A5ULL, 0xD6FA12DF0D1F0F0DULL, 0x31627580B2C0B0B5ULL, 0xBBA69669D741D1D7ULL, 0x3696837B4DCB4B48ULL, 0x08916A404920484AULL, 0xBF056D3B86EB8386ULL, 0x7B8AA6CDB715B5B3ULL, 0xF5EDBE76833E8680ULL, 0xB63BF1DF6C9F6F6EULL, 0x142ED97660BE6667ULL, 0x4422B95210AA1213ULL, 0x616E15A4C2D4C4C1ULL, 0x6AA8724D25552527ULL, 0xAA8442E945014143ULL, 0x35C18ED2E36AE2E4ULL, 0x5F69FC9FC63FC7C3ULL, 0x108D222431143436ULL,
};
// X64 is the working type for one 64-bit state word; ASX64(v) converts a value
// to that type. The host build keeps a plain UINT64 (ASX64 is the identity),
// while the OpenCL build holds the word as a uint2 and reinterprets via as_uint2.
#ifndef __OPENCL_VERSION__
#define X64 UINT64
#define ASX64(v) (v)
#else
#define X64 uint2
#define ASX64(v) (as_uint2(v))
#endif
// Byte-offset table accessors. Each macro views a table as raw bytes, steps
// off8 bytes into it and reads one UINT64 from that position. The LUTn_r3
// callers in this file always pass an offset masked with 0x7F8, i.e. a
// multiple of 8 in the range 0..2040.
// Toff8 / TCoff8 refer to TAll_local / arrPrecalc_post, which are not defined
// in the visible part of this file -- presumably declared elsewhere.
#define Toff8(off8) (*(const LOCAL UINT64*)&(((const LOCAL UINT8*)TAll_local)[off8]))
#define TCoff8(off8) (*(const CONSTANT UINT64*)&(((const CONSTANT UINT8*)arrPrecalc_post)[off8]))
// LOCAL copy of the r3 table (TAll_local_r3 is declared inside the kernel).
#define Toff8_r3(off8) (*(const LOCAL UINT64*)&(((const LOCAL UINT8*)TAll_local_r3)[off8]))
// Direct CONSTANT-memory reads of the r3 and l27 tables.
#define TCoff8_r3(off8) (*(const CONSTANT UINT64*)&(((const CONSTANT UINT8*)arrPrecalc_post_r3)[off8]))
#define TCoff8_l27(off8) (*(const CONSTANT UINT64*)&(((const CONSTANT UINT8*)arrPrecalc_post_l27)[off8]))
// Reads a UINT64 from CONSTANT memory using off8 as the complete byte address
// (the base pointer is 0); the caller must supply an already-resolved address.
#define TC0off8_l27(off8) (*(const CONSTANT UINT64*)&(((const CONSTANT UINT8*)0)[off8]))
// LUTn_r3(v): table lookup driven by one 8-bit field of the 64-bit word v.
//
// Field n occupies bits (8n+3)..(8n+10) of v (field 7 wraps around: bits
// 59..63 followed by bits 0..2). Masking with 0x7F8 keeps the field already
// multiplied by 8, so it is used directly as a byte offset into the table.
// The result is the r3 table entry rotated left by 8n bits, as an X64.
//
// Host path: v is a UINT64 and every rotation is done with ROL64.
// OpenCL path: v is a uint2 (NOTE(review): assumes .x is the low 32 bits).
//   - LUT0..2 / LUT4..6 read the LOCAL copy of the r3 table; LUT4..6 realise
//     the extra 32-bit rotation by swapping the halves (.yx).
//   - LUT3 / LUT7 read arrPrecalc_post_l27 (the entries rotated by 24) from
//     CONSTANT memory. The address is formed with
//     bitselect(baseL27, index, 0x7F8), which overwrites bits 3..10 of the
//     table base with the index; this equals base + offset only if those bits
//     of the base are zero (NOTE(review): confirm the table's alignment).
//     baseL27 must be defined at the point of use, and amd_bitalign requires
//     the AMD media-ops extension to be available.
//
// Every argument is parenthesised and no definition ends in ';', so the
// macros can be used with arbitrary expressions and inside larger expressions.
#ifdef __OPENCL_VERSION__
#define LUT0_r3(v) ASX64(Toff8_r3((v).x & 0x7F8U))
#define LUT1_r3(v) ASX64(ROL64(Toff8_r3(((v).x >> 8) & 0x7F8U), 8))
#define LUT2_r3(v) ASX64(ROL64(Toff8_r3(((v).x >> 16) & 0x7F8U), 16))
#define LUT3_r3(v) ASX64(TC0off8_l27(bitselect(baseL27, (UINT32)(as_ulong(v) >> 24), 0x7F8U)))
#define LUT4_r3(v) (ASX64(Toff8_r3(((v).y) & 0x7F8U))).yx
#define LUT5_r3(v) (ASX64(ROL64(Toff8_r3(((v).y >> 8) & 0x7F8U), 8))).yx
#define LUT6_r3(v) (ASX64(ROL64(Toff8_r3(((v).y >> 16) & 0x7F8U), 16))).yx
#define LUT7_r3(v) (ASX64(TC0off8_l27(bitselect(baseL27, amd_bitalign((v).x, (v).y, 24), 0x7F8U)))).yx
#else
#define LUT0_r3(v) ASX64(Toff8_r3((v) & 0x7F8U))
#define LUT1_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 8) & 0x7F8U), 8))
#define LUT2_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 16) & 0x7F8U), 16))
#define LUT3_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 24) & 0x7F8U), 24))
#define LUT4_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 32) & 0x7F8U), 32))
#define LUT5_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 40) & 0x7F8U), 40))
#define LUT6_r3(v) ASX64(ROL64(Toff8_r3(((v) >> 48) & 0x7F8U), 48))
#define LUT7_r3(v) ASX64(ROL64(Toff8_r3(ROR64((v), 56) & 0x7F8U), 56))
#endif
// vanilla0_post: evaluates one candidate per work-item and records matches.
//
// Parameters
//   pOut32   result list: pOut32[0] is a running count, matching IDs are
//            stored from pOut32[1] onwards (OpenCL path increments the count
//            atomically). The buffer size is not checked here.
//   pIn32    precomputed input, read as an array of 64-bit words:
//              words  0..55  per-stage values loaded before each lookup pass
//              words 64..71  initial contents of state A
//              words 72..75  the partial words "pre5", "pre6", "pre7", "pre0"
//              word  76      base word that the candidate ID is XORed into
//   nMask    (OpenCL only) bits of the 32-bit comparison that must be zero;
//            the host build uses 0xFFFFFFFF.
//   GLOBALID (host only) candidate ID; on the OpenCL path GLOBALID is expected
//            to be provided by the surrounding project macros.
//
// Synchronisation: the r3 table is copied into LOCAL memory cooperatively by
// the work-group, so a work-group barrier follows the copy before any lookup.
KERNEL void vanilla0_post(
GLOBAL UINT32 *pOut32,
GLOBAL const UINT32 *pIn32
#ifdef __OPENCL_VERSION__
, UINT32 nMask
#else
, UINT32 GLOBALID
#endif
)
{
#ifndef __OPENCL_VERSION__
    UINT32 nMask = 0xFFFFFFFF;
#endif
    GLOBAL const UINT64 *arrMidstate = (GLOBAL const UINT64 *)pIn32;
// Address of the l27 table truncated to 32 bits; consumed by LUT3_r3/LUT7_r3
// on the OpenCL path.
#define baseL27 ((UINT32)&arrPrecalc_post_l27[0])
    LOCAL X64 TAll_local_r3[256*1];
    {
        // Each work-item copies the entries at nLocalId, nLocalId + WORKSIZE, ...
        // The bound check keeps the copy inside the 256-entry table when
        // WORKSIZE does not divide 256 or is larger than 256.
        UINT32 nLocalId = LOCALID;
        for(unsigned i = 0; i < 256; i += WORKSIZE)
        {
            const unsigned nEntry = i + nLocalId;
            if(nEntry < 256)
            {
                TAll_local_r3[nEntry] = ASX64(arrPrecalc_post_r3[nEntry]);
            }
        }
    }
#ifdef __OPENCL_VERSION__
    // Entries written by other work-items are read below: make the whole
    // table visible to the work-group before the first lookup.
    barrier(CLK_LOCAL_MEM_FENCE);
#endif
    GLOBAL const UINT64 *post_pStatesPre64_pr3 = arrMidstate;
    X64 stateAX64_pr3[8];
    X64 stateBX64_pr3[8];
    for(unsigned i = 0; i < 8; ++i)
    {
        stateAX64_pr3[i] = ASX64(arrMidstate[64+i]);
    }
    X64 post_pre5_pr3 = ASX64(arrMidstate[64+8+0]);
    X64 post_pre6_pr3 = ASX64(arrMidstate[64+8+1]);
    X64 post_pre7_pr3 = ASX64(arrMidstate[64+8+2]);
    X64 post_pre0_pr3 = ASX64(arrMidstate[64+8+3]);
    // XOR the candidate ID, rotated right by 29 bits as a 64-bit value, into
    // word 76. Both paths compute the same 64-bit quantity.
#ifdef __OPENCL_VERSION__
    X64 post_nonceXored_pr3 = ASX64(arrMidstate[64+8+4]);
    post_nonceXored_pr3.x ^= GLOBALID >> 29;
    post_nonceXored_pr3.y ^= GLOBALID << 3;
#else
    X64 post_tmp_pr3 = ASX64(arrMidstate[64+8+4]);
    UINT32 post_preHWA32_pr0 = ROR64(post_tmp_pr3, 3) & 0xFFFFFFFF;
    X64 post_nonceXored_pr3 = post_tmp_pr3 ^ ROR64(((UINT64)GLOBALID), 29); // & 0x00FFFF00
#endif
    // Only lookups 4..7 of the ID-dependent word are applied here; each one
    // completes one of the partial words, which is then spread over state A
    // with all eight lookups.
    post_pre5_pr3 ^= LUT4_r3(ASX64(post_nonceXored_pr3));
    stateAX64_pr3[7&(5+0)] ^= LUT0_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+1)] ^= LUT1_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+2)] ^= LUT2_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+3)] ^= LUT3_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+4)] ^= LUT4_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+5)] ^= LUT5_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+6)] ^= LUT6_r3(post_pre5_pr3);
    stateAX64_pr3[7&(5+7)] ^= LUT7_r3(post_pre5_pr3);
    post_pre6_pr3 ^= LUT5_r3(ASX64(post_nonceXored_pr3));
    stateAX64_pr3[7&(6+0)] ^= LUT0_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+1)] ^= LUT1_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+2)] ^= LUT2_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+3)] ^= LUT3_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+4)] ^= LUT4_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+5)] ^= LUT5_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+6)] ^= LUT6_r3(post_pre6_pr3);
    stateAX64_pr3[7&(6+7)] ^= LUT7_r3(post_pre6_pr3);
    post_pre7_pr3 ^= LUT6_r3(ASX64(post_nonceXored_pr3));
    stateAX64_pr3[7&(7+0)] ^= LUT0_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+1)] ^= LUT1_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+2)] ^= LUT2_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+3)] ^= LUT3_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+4)] ^= LUT4_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+5)] ^= LUT5_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+6)] ^= LUT6_r3(post_pre7_pr3);
    stateAX64_pr3[7&(7+7)] ^= LUT7_r3(post_pre7_pr3);
    post_pre0_pr3 ^= LUT7_r3(ASX64(post_nonceXored_pr3));
    stateAX64_pr3[7&(0+0)] ^= LUT0_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+1)] ^= LUT1_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+2)] ^= LUT2_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+3)] ^= LUT3_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+4)] ^= LUT4_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+5)] ^= LUT5_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+6)] ^= LUT6_r3(post_pre0_pr3);
    stateAX64_pr3[7&(0+7)] ^= LUT7_r3(post_pre0_pr3);
    {
        // Three double passes: A -> B using words round*16+0..7, then
        // B -> A using words round*16+8..15. In each pass the destination is
        // loaded from the input and every source word p XORs lookup k into
        // destination word (p + k) & 7.
        for(unsigned round = 0; round < 3; round++)
        {
            for(unsigned j = 0; j < 8; j++)
            {
                stateBX64_pr3[j] = ASX64(post_pStatesPre64_pr3[round*16 + 0 +j]);
            }
            for(unsigned p = 0; p < 8; ++p)
            {
                stateBX64_pr3[7&(p+0)] ^= LUT0_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+1)] ^= LUT1_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+2)] ^= LUT2_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+3)] ^= LUT3_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+4)] ^= LUT4_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+5)] ^= LUT5_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+6)] ^= LUT6_r3(stateAX64_pr3[p]);
                stateBX64_pr3[7&(p+7)] ^= LUT7_r3(stateAX64_pr3[p]);
            }
            for(unsigned j = 0; j < 8; j++)
            {
                stateAX64_pr3[j] = ASX64(post_pStatesPre64_pr3[round*16 + 8 +j]);
            }
            for(unsigned p = 0; p < 8; ++p)
            {
                stateAX64_pr3[7&(p+0)] ^= LUT0_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+1)] ^= LUT1_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+2)] ^= LUT2_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+3)] ^= LUT3_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+4)] ^= LUT4_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+5)] ^= LUT5_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+6)] ^= LUT6_r3(stateBX64_pr3[p]);
                stateAX64_pr3[7&(p+7)] ^= LUT7_r3(stateBX64_pr3[p]);
            }
        }
        // One more A -> B pass using words 48..55.
        for(unsigned j = 0; j < 8; j++)
        {
            stateBX64_pr3[j] = ASX64(post_pStatesPre64_pr3[6*8 + 0 +j]);
        }
        for(unsigned p = 0; p < 8; ++p)
        {
            stateBX64_pr3[7&(p+0)] ^= LUT0_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+1)] ^= LUT1_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+2)] ^= LUT2_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+3)] ^= LUT3_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+4)] ^= LUT4_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+5)] ^= LUT5_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+6)] ^= LUT6_r3(stateAX64_pr3[p]);
            stateBX64_pr3[7&(p+7)] ^= LUT7_r3(stateAX64_pr3[p]);
        }
        // by result row; 3,5
        // Only output rows 3 and 5 are computed, XORed together: for row r,
        // lookup k is taken from stateB[(r - k) & 7].
        X64 result64_pr3 = LUT0_r3(stateBX64_pr3[7&(8+3-0)]);
        result64_pr3 ^= LUT1_r3(stateBX64_pr3[7&(8+3-1)]);
        result64_pr3 ^= LUT2_r3(stateBX64_pr3[7&(8+3-2)]);
        result64_pr3 ^= LUT3_r3(stateBX64_pr3[7&(8+3-3)]);
        result64_pr3 ^= LUT4_r3(stateBX64_pr3[7&(8+3-4)]);
        result64_pr3 ^= LUT5_r3(stateBX64_pr3[7&(8+3-5)]);
        result64_pr3 ^= LUT6_r3(stateBX64_pr3[7&(8+3-6)]);
        result64_pr3 ^= LUT7_r3(stateBX64_pr3[7&(8+3-7)]);
        result64_pr3 ^= LUT0_r3(stateBX64_pr3[7&(8+5-0)]);
        result64_pr3 ^= LUT1_r3(stateBX64_pr3[7&(8+5-1)]);
        result64_pr3 ^= LUT2_r3(stateBX64_pr3[7&(8+5-2)]);
        result64_pr3 ^= LUT3_r3(stateBX64_pr3[7&(8+5-3)]);
        result64_pr3 ^= LUT4_r3(stateBX64_pr3[7&(8+5-4)]);
        result64_pr3 ^= LUT5_r3(stateBX64_pr3[7&(8+5-5)]);
        result64_pr3 ^= LUT6_r3(stateBX64_pr3[7&(8+5-6)]);
        result64_pr3 ^= LUT7_r3(stateBX64_pr3[7&(8+5-7)]);
        // Match test: the low 32 bits of (word 76 rotated right by 3) XOR the
        // low 32 bits of (result rotated right by 35) must be zero under nMask.
        // On a match the candidate ID is appended to the output list.
#ifdef __OPENCL_VERSION__
        X64 post_tmp_pr3 = ASX64(arrMidstate[64+8+4]);
        UINT32 post_preHWA32_pr0 = amd_bitalign(post_tmp_pr3.y, post_tmp_pr3.x, 3);
        if(!(nMask & (post_preHWA32_pr0 ^ amd_bitalign(result64_pr3.x, result64_pr3.y, 3))))
        {
            // atom_inc returns the previous count, so the slot is count + 1.
            pOut32[atom_inc(&pOut32[0]) + 1] = GLOBALID;
        }
#else
        post_preHWA32_pr0 ^= ROR64(result64_pr3, 32+3);
        if(!(post_preHWA32_pr0 & nMask))
        {
            pOut32[++pOut32[0]] = GLOBALID;
        }
#endif
    }
}