microsoft · vladimirmoushkov · Mar 31, 2026 · Mar 31, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,6 +16,7 @@ option(BITNET_ARM_TL1    "bitnet.cpp: use tl1 on arm platform"    OFF)
 option(BITNET_X86_TL2    "bitnet.cpp: use tl2 on x86 platform"    OFF)
 
 
+set(CMAKE_CXX_STANDARD 17)  # default C++ standard for this project; CMAKE_CXX_STANDARD_REQUIRED below makes it mandatory
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)

diff --git a/include/bitnet-lut-kernels.h b/include/bitnet-lut-kernels.h
diff --git a/include/kernel_config.ini b/include/kernel_config.ini
@@ -0,0 +1,28 @@
+[Kernels_0]
+m = 5120
+k = 5120
+bm = 256
+bk = 96
+bmm = 32
+
+[Kernels_1]
+m = 1024
+k = 5120
+bm = 256
+bk = 96
+bmm = 32
+
+[Kernels_2]
+m = 27648
+k = 5120
+bm = 256
+bk = 96
+bmm = 32
+
+[Kernels_3]
+m = 5120
+k = 27648
+bm = 128
+bk = 96
+bmm = 32
+
diff --git a/src/ggml-bitnet-mad.cpp b/src/ggml-bitnet-mad.cpp
@@ -808,7 +808,7 @@ void ggml_vec_dot_i2_i8_s_Nx1(int n, float * s, size_t bs, const void * vx, size
             accu[iy] = _mm256_setzero_si256();
         }
 
-        int8_t * y_col = y + col * by;
+        const int8_t * y_col = y + col * by;  // read-only pointer into y, offset by col * by
 
         for (int i = 0; i < group32_num; i++) {
             const uint8_t *px = x + i * 1024;

diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
@@ -690,7 +690,11 @@ def get_three_k_two_k(K, bk):
         "Llama3-8B-1.58-100B-tokens"        : [[14336, 4096],
                                                [4096, 14336],
                                                [1024, 4096],
-                                               [4096, 4096]] 
+                                               [4096, 4096]],
+        "Qwen2.5-32B-TL2"                  : [[5120, 5120],  # [M, K] per kernel; appears to mirror m/k of Kernels_0..3 in include/kernel_config.ini - confirm
+                                               [1024, 5120],
+                                               [27648, 5120],
+                                               [5120, 27648]]
     }
 
     parser = argparse.ArgumentParser(description='gen impl')

diff --git a/windows-tl2-build-fixes.patch b/windows-tl2-build-fixes.patch
@@ -0,0 +1,103 @@
+diff --git a/common/common.cpp b/common/common.cpp
+index 451307b5..20542501 100644
+--- a/common/common.cpp
++++ b/common/common.cpp
+@@ -16,6 +16,7 @@
+ #include <cmath>
+ #include <codecvt>
+ #include <cstdarg>
++#include <chrono>
+ #include <cstring>
+ #include <ctime>
+ #include <fstream>
+diff --git a/common/log.cpp b/common/log.cpp
+index 04c7c0ed..aea209fc 100644
+--- a/common/log.cpp
++++ b/common/log.cpp
+@@ -1,5 +1,6 @@
+ #include "log.h"
+
++#include <chrono>
+ #include <condition_variable>
+ #include <cstdarg>
+ #include <cstdio>
+diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
+index d1ff3e8b..bb4f8d9a 100644
+--- a/examples/imatrix/imatrix.cpp
++++ b/examples/imatrix/imatrix.cpp
+@@ -3,6 +3,7 @@
+ #include "log.h"
+ #include "llama.h"
+
++#include <chrono>
+ #include <cmath>
+ #include <cstdio>
+ #include <cstring>
+diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
+index efb41b80..2f821e0c 100644
+--- a/examples/perplexity/perplexity.cpp
++++ b/examples/perplexity/perplexity.cpp
+@@ -6,6 +6,7 @@
+ #include <algorithm>
+ #include <array>
+ #include <atomic>
++#include <chrono>
+ #include <cmath>
+ #include <cstdio>
+ #include <cstring>
+diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt
+index 1b425ede..dad3dfff 100644
+--- a/ggml/CMakeLists.txt
++++ b/ggml/CMakeLists.txt
+@@ -186,7 +186,7 @@ endif()
+ set(CMAKE_C_STANDARD 11)
+ set(CMAKE_C_STANDARD_REQUIRED true)
+
+-if (GGML_SYCL)
++if (GGML_SYCL OR GGML_BITNET_X86_TL2)
+     set(CMAKE_CXX_STANDARD 17)
+ else()
+     set(CMAKE_CXX_STANDARD 11)
+diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
+index 121f72da..bad7f8d4 100644
+--- a/ggml/src/ggml.c
++++ b/ggml/src/ggml.c
+@@ -12686,7 +12686,7 @@ static void ggml_compute_forward_mul_mat(
+
+         struct bitnet_tensor_extra * wt = src0->extra;
+         char * cur_wdata = wdata;
+-        bitnet_float_type * bitnet_f_ptr = wdata;
++        bitnet_float_type * bitnet_f_ptr = (bitnet_float_type *) wdata;
+         if (sizeof(bitnet_float_type) == 2) {
+             cur_wdata = wdata + MAX(ne10, ne01) * ne11 * sizeof(bitnet_float_type);
+         };
+@@ -12705,7 +12705,7 @@ static void ggml_compute_forward_mul_mat(
+             GGML_ASSERT(src1->type == GGML_TYPE_F32);
+             bitnet_float_type * act_input;
+             if (sizeof(bitnet_float_type) == 2) {
+-                ggml_fp32_to_fp16_row(src1->data, bitnet_f_ptr, ne10 * ne11);
++                ggml_fp32_to_fp16_row(src1->data, (ggml_fp16_t *)bitnet_f_ptr, ne10 * ne11);
+                 act_input = bitnet_f_ptr;
+             } else {
+                 act_input = src1->data;
+@@ -12760,9 +12760,9 @@ static void ggml_compute_forward_mul_mat(
+         // src1: activation, ne10 = k, ne11 = m
+         char * wdata = params->wdata;
+
+-        struct bitnet_tensor_extra * wt = src0->extra;
++        struct bitnet_tensor_extra * wt = (struct bitnet_tensor_extra *) src0->extra;
+         char * cur_wdata = wdata;
+-        bitnet_float_type * bitnet_f_ptr = wdata;
++        bitnet_float_type * bitnet_f_ptr = (bitnet_float_type *) wdata;
+         if (sizeof(bitnet_float_type) == 2) {
+             cur_wdata = wdata + MAX(ne10, ne01) * ne11 * sizeof(bitnet_float_type);
+         };
+@@ -12787,7 +12787,7 @@ static void ggml_compute_forward_mul_mat(
+             GGML_ASSERT(src1->type == GGML_TYPE_F32);
+             bitnet_float_type * act_input;
+             if (sizeof(bitnet_float_type) == 2) {
+-                ggml_fp32_to_fp16_row(src1->data, bitnet_f_ptr, ne10 * ne11);
++                ggml_fp32_to_fp16_row(src1->data, (ggml_fp16_t *)bitnet_f_ptr, ne10 * ne11);
+                 act_input = bitnet_f_ptr;
+             } else {
+                 act_input = src1->data;