diff --git a/setup_env.py b/setup_env.py index b9bf5fc..8a9c4b4 100644 --- a/setup_env.py +++ b/setup_env.py @@ -34,7 +34,6 @@ COMPILER_EXTRA_ARGS = { OS_EXTRA_ARGS = { "Windows":["-T", "ClangCL"], - "Linux": ["-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"] } ARCH_ALIAS = { diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py index 44d2418..4d94081 100644 --- a/utils/codegen_tl2.py +++ b/utils/codegen_tl2.py @@ -105,7 +105,7 @@ inline int32_t partial_max_reset(int32_t bs, void* lut_scales_) {\n\ template\n\ inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\ #if defined __AVX2__\n\ - __m256 vec_lut[16];\n\ + __m256i vec_lut[16];\n\ const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\ float scales = *lut_scales;\n\ __m256i shuffle_mask = _mm256_set_epi8(\n\ @@ -191,7 +191,7 @@ inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_t template\n\ inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\ #if defined __AVX2__\n\ - __m256 vec_lut[16];\n\ + __m256i vec_lut[16];\n\ const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\ float scales = *lut_scales;\n\ __m256i shuffle_mask = _mm256_set_epi8(\n\