OpenCL compiler has failed to compile a function
I try to porting function on the OpenCL kernel, but compiler gave me next error:
cvmsErrorCompilerFailure: LLVM compiler has failed to compile a function.
Here is my kernel code:
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
//##############################################################################
// Utils function
//##############################################################################
void mmemcpy(unsigned char *dst, const unsigned char *src, size_t len) {
for (size_t i = 0; i < len; i++)
dst[i] = src[i];
}
//##############################################################################
// Tree hashing constants definition
//##############################################################################
/* tweak word T[1]: bit field starting positions */
#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */
#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) /* bits 112..118: level in hash tree */
#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) /* bit 119 : partial final input byte */
#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */
#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */
#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */
/* tweak word T[1]: flag bit definition(s) */
#define SKEIN_T1_FLAG_FIRST (((unsigned long) 1 ) << SKEIN_T1_POS_FIRST)
#define SKEIN_T1_FLAG_FINAL (((unsigned long) 1 ) << SKEIN_T1_POS_FINAL)
#define SKEIN_T1_FLAG_BIT_PAD (((unsigned long) 1 ) << SKEIN_T1_POS_BIT_PAD)
//##############################################################################
// Skein macros
//##############################################################################
#ifndef RotL_64
#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64-(N))))
#endif
//##############################################################################
// Skein block
//##############################################################################
#ifndef SKEIN_USE_ASM
#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
#endif
#ifndef SKEIN_LOOP
#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
#endif
#define BLK_BITS (WCNT*64) /* some useful definitions for code here */
#define KW_TWK_BASE (0)
#define KW_KEY_BASE (3)
#define ks (kw + KW_KEY_BASE)
#define ts (kw + KW_TWK_BASE)
//##############################################################################
// Port settings
//##############################################################################
/* Platform settings */
#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
/* Define platfor here */
#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
#if PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
/* here for big-endian */
#define SKEIN_NEED_SWAP (1)
#elif PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
/* here for little-endian */
#define SKEIN_NEED_SWAP (0)
#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */
#define Skein_Put64_LSB_First(dst08, src64, bCnt) mmemcpy(dst08, src64, bCnt)
#define Skein_Get64_LSB_First(dst64, src08, wCnt) mmemcpy(dst64, src08, 8*(wCnt))
#endif
#endif
//##############################################################################
// Skein.h
//##############################################################################
enum {
/* Skein_256 round rotation constants */
R_256_0_0=14, R_256_0_1=16,
R_256_1_0=52, R_256_1_1=57,
R_256_2_0=23, R_256_2_1=40,
R_256_3_0= 5, R_256_3_1=37,
R_256_4_0=25, R_256_4_1=33,
R_256_5_0=46, R_256_5_1=12,
R_256_6_0=58, R_256_6_1=22,
R_256_7_0=32, R_256_7_1=32,
/* Skein_512 round rotation constants */
R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
/* Skein1024 round rotation constants */
R1024_0_0=24, R1024_0_1=13, R1024_0_2= 8, R1024_0_3=47, R1024_0_4= 8, R1024_0_5=17, R1024_0_6=22, R1024_0_7=37,
R1024_1_0=38, R1024_1_1=19, R1024_1_2=10, R1024_1_3=55, R1024_1_4=49, R1024_1_5=18, R1024_1_6=23, R1024_1_7=52,
R1024_2_0=33, R1024_2_1= 4, R1024_2_2=51, R1024_2_3=13, R1024_2_4=34, R1024_2_5=41, R1024_2_6=59, R1024_2_7=17,
R1024_3_0= 5, R1024_3_1=20, R1024_3_2=48, R1024_3_3=41, R1024_3_4=47, R1024_3_5=28, R1024_3_6=16, R1024_3_7=25,
R1024_4_0=41, R1024_4_1= 9, R1024_4_2=37, R1024_4_3=31, R1024_4_4=12, R1024_4_5=47, R1024_4_6=44, R1024_4_7=30,
R1024_5_0=16, R1024_5_1=34, R1024_5_2=56, R1024_5_3=51, R1024_5_4= 4, R1024_5_5=53, R1024_5_6=42, R1024_5_7=41,
R1024_6_0=31, R1024_6_1=44, R1024_6_2=47, R1024_6_3=46, R1024_6_4=19, R1024_6_5=42, R1024_6_6=44, R1024_6_7=25,
R1024_7_0= 9, R1024_7_1=48, R1024_7_2=35, R1024_7_3=52, R1024_7_4=23, R1024_7_5=31, R1024_7_6=37, R1024_7_7=20
};
#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */
#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian)*/
#endif
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((unsigned long) (hi32)) << 32))
#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION,SKEIN_ID_STRING_LE)
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
//##############################################################################
// Skein structures
//##############################################################################
enum {
SKEIN_SUCCESS = 0, /* return codes from Skein calls */
SKEIN_FAIL = 1,
SKEIN_BAD_HASHLEN = 2
};
#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */
#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_MAX_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_ROUNDS_TOTAL (72) /* number of rounds for the different block sizes */
#define SKEIN_512_ROUNDS_TOTAL (72)
#define SKEIN1024_ROUNDS_TOTAL (80)
typedef struct {
unsigned long hashBitLen; /* size of hash result, in bits */
unsigned long bCnt; /* current byte count in buffer b[] */
unsigned long T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
} Skein_Ctxt_Hdr_t;
typedef struct { /* 256-bit Skein hash context structure */
Skein_Ctxt_Hdr_t h; /* common header context variables */
unsigned long X[SKEIN_256_STATE_WORDS]; /* chaining variables */
unsigned char b[SKEIN_256_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
} Skein_256_Ctxt_t;
//##############################################################################
// 256-bit Skein
//##############################################################################
/***************************** Skein_256 ******************************/
void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const unsigned *blkPtr, unsigned long blkCnt, unsigned long byteCntAdd) { /* do it in C */
enum {
WCNT = SKEIN_256_STATE_WORDS
};
#undef RCNT
#define RCNT (SKEIN_256_ROUNDS_TOTAL/8)
#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
#define SKEIN_UNROLL_256 (((SKEIN_LOOP)/100)%10)
#else
#define SKEIN_UNROLL_256 (0)
#endif
#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Inv开发者_开发知识库alid SKEIN_UNROLL_256" /* sanity check on unroll count */
#endif
unsigned long r;
unsigned long kw[WCNT+4+RCNT*2]; /* key schedule words : chaining vars + tweak + "rotation"*/
#else
unsigned long kw[WCNT+4]; /* key schedule words : chaining vars + tweak */
#endif
unsigned long X0, X1, X2, X3; /* local copy of context vars, for speed */
unsigned long w[WCNT]; /* local copy of input block */
/* never call with blkCnt == 0! */
if (!(blkCnt != 0))
return;
ts[0] = ctx->h.T[0];
ts[1] = ctx->h.T[1];
do {
/* this implementation only supports 2**64 input bytes (no carry out here) */
ts[0] += byteCntAdd; /* update processed length */
/* precompute the key schedule for this block */
ks[0] = ctx->X[0];
ks[1] = ctx->X[1];
ks[2] = ctx->X[2];
ks[3] = ctx->X[3];
ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
ts[2] = ts[0] ^ ts[1];
Skein_Get64_LSB_First(w, blkPtr, WCNT); /* get input block in little-endian format */
X0 = w[0] + ks[0]; /* do the first full key injection */
X1 = w[1] + ks[1] + ts[0];
X2 = w[2] + ks[2] + ts[1];
X3 = w[3] + ks[3];
blkPtr += SKEIN_256_BLOCK_BYTES;
/* run the rounds */
// IN THIS MACRO WHEN I EXCHANGED X##p1 and X##p in RotL_64 for numbers 1, 2, THEN COMPILATION IS OK
#define Round256(p0, p1, p2, p3, ROT, rNum) \
X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
#if SKEIN_UNROLL_256 == 0
#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
Round256(p0, p1, p2, p3, ROT, rNum)
#define I256(R) \
X0 += ks[((R)+1) % 5]; /* inject the key schedule value */ \
X1 += ks[((R)+2) % 5] + ts[((R)+1) % 3]; \
X2 += ks[((R)+3) % 5] + ts[((R)+2) % 3]; \
X3 += ks[((R)+4) % 5] + (R)+1;
#else /* looping version */
#define R256(p0, p1, p2, p3, ROT, rNum) \
Round256(p0, p1, p2, p3, ROT, rNum) \
#define I256(R) \
X0 += ks[r+(R)+0]; /* inject the key schedule value */ \
X1 += ks[r+(R)+1] + ts[r+(R)+0]; \
X2 += ks[r+(R)+2] + ts[r+(R)+1]; \
X3 += ks[r+(R)+3] + r+(R) ; \
ks[r + (R)+4 ] = ks[r+(R)-1]; /* rotate key schedule */\
ts[r + (R)+2 ] = ts[r+(R)-1];
for (r = 1; r < 2*RCNT; r += 2*SKEIN_UNROLL_256) /* loop thru it */
#endif
{
#define R256_8_rounds(R) \
R256(0,1,2,3,R_256_0,8*(R) + 1); \
R256(0,3,2,1,R_256_1,8*(R) + 2); \
R256(0,1,2,3,R_256_2,8*(R) + 3); \
R256(0,3,2,1,R_256_3,8*(R) + 4); \
I256(2*(R)); \
R256(0,1,2,3,R_256_4,8*(R) + 5); \
R256(0,3,2,1,R_256_5,8*(R) + 6); \
R256(0,1,2,3,R_256_6,8*(R) + 7); \
R256(0,3,2,1,R_256_7,8*(R) + 8); \
I256(2*(R)+1);
R256_8_rounds( 0);
#define R256_Unroll_R(NN) ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_256 > (NN)))
#if R256_Unroll_R( 1)
R256_8_rounds( 1);
#endif
#if R256_Unroll_R( 2)
R256_8_rounds( 2);
#endif
#if R256_Unroll_R( 3)
R256_8_rounds( 3);
#endif
#if R256_Unroll_R( 4)
R256_8_rounds( 4);
#endif
#if R256_Unroll_R( 5)
R256_8_rounds( 5);
#endif
#if R256_Unroll_R( 6)
R256_8_rounds( 6);
#endif
#if R256_Unroll_R( 7)
R256_8_rounds( 7);
#endif
#if R256_Unroll_R( 8)
R256_8_rounds( 8);
#endif
#if R256_Unroll_R( 9)
R256_8_rounds( 9);
#endif
#if R256_Unroll_R(10)
R256_8_rounds(10);
#endif
#if R256_Unroll_R(11)
R256_8_rounds(11);
#endif
#if R256_Unroll_R(12)
R256_8_rounds(12);
#endif
#if R256_Unroll_R(13)
R256_8_rounds(13);
#endif
#if R256_Unroll_R(14)
R256_8_rounds(14);
#endif
#if (SKEIN_UNROLL_256 > 14)
#error "need more unrolling in Skein_256_Process_Block"
#endif
}
// WHEN I COMMENT NEXT 4 LINES, THEN COMPILATION IS OK
ctx->X[0] = X0 ^ w[0];
ctx->X[1] = X1 ^ w[1];
ctx->X[2] = X2 ^ w[2];
ctx->X[3] = X3 ^ w[3];
ts[1] &= ~SKEIN_T1_FLAG_FIRST;
}
while (--blkCnt);
ctx->h.T[0] = ts[0];
ctx->h.T[1] = ts[1];
}
__kernel void update( __global Skein_256_Ctxt_t *gctx) {
int glId = get_global_id(0);
int lId = get_local_id(0);
int grId = get_group_id(0);
// TODO now is one hashState per block, try one hs per thread to accelerate computing
gctx += grId;
Skein_256_Ctxt_t ctx;
ctx.h.hashBitLen = 256;
Skein_256_Process_Block(&ctx, &ctx.b, 1, SKEIN_256_BLOCK_BYTES);
*gctx = ctx;
}
When I comment marked line (I wrote this line in capital letters) or change the values (not both) the compilation is ok. Can anyone help me?
It seems to me that you forgot to specify the type of memory you are dealing with i.e __global
or __local
OpenCL might think you are dealing with host RAM memory. By the way consider rewriting your program from scratch, do that step by step. You cant just take the ordinary c code and compile it with opencl.
精彩评论