diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7a0c99..6ddaa51 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,12 @@ if (GGML_BITNET_X86_TL2)
     add_compile_definitions(GGML_BITNET_X86_TL2)
 endif()
 
+# -fpermissive is a GCC C++ option that downgrades some conformance errors to
+# warnings; pass it only to C++ sources compiled by a GNU C++ compiler.
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:-fpermissive>")
+endif()
+
 find_package(Threads REQUIRED)
 
 add_subdirectory(src)
diff --git a/setup_env.py b/setup_env.py
index b9bf5fc..8a9c4b4 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -34,7 +34,6 @@ COMPILER_EXTRA_ARGS = {
 
 OS_EXTRA_ARGS = {
     "Windows":["-T", "ClangCL"],
-    "Linux": ["-DCMAKE_C_COMPILER=clang", "-DCMAKE_CXX_COMPILER=clang++"]
 }
 
 ARCH_ALIAS = {
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9cead70..bac8459 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -4,7 +4,8 @@ set(GGML_SOURCES_BITNET ggml-bitnet-lut.cpp)
 
 include_directories(3rdparty/llama.cpp/ggml/include)
 
-if ((NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") OR
-(NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang"))
-    message(FATAL_ERROR "Clang is required for Bitnet.cpp compilation")
-endif()
\ No newline at end of file
+# Abort unless both compiler IDs either contain "Clang" or are exactly "GNU".
+if (NOT ((CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_C_COMPILER_ID STREQUAL "GNU") AND
+         (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+    message(FATAL_ERROR "Clang or GCC is required for Bitnet.cpp compilation")
+endif()
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 44d2418..4d94081 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -105,7 +105,7 @@ inline int32_t partial_max_reset(int32_t bs, void* lut_scales_) {\n\
 template<int act_k>\n\
 inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256 vec_lut[16];\n\
+    __m256i vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(84, 72, 60, 48, 36, 24, 12, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\
@@ -191,7 +191,7 @@ inline int32_t three_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_t
 template<int act_k>\n\
 inline int32_t two_lut_ctor(int8_t* qlut, bitnet_float_type* b, bitnet_float_type* lut_scales) {\n\
 #if defined __AVX2__\n\
-    __m256 vec_lut[16];\n\
+    __m256i vec_lut[16];\n\
     const __m256i vec_bi = _mm256_set_epi32(56, 48, 40, 32, 24, 16, 8, 0);\n\
     float scales = *lut_scales;\n\
     __m256i shuffle_mask = _mm256_set_epi8(\n\