From a0e0559797e1623867741c8ef60a40fdb3f74bf5 Mon Sep 17 00:00:00 2001
From: Vladimir Moushkov <vladimir@moushkov.info>
Date: Tue, 31 Mar 2026 14:39:10 +0000
Subject: [PATCH 1/2] Fix TL2 kernel batch output stride bug causing NaN for
 batch_size > 1

The generated TL2 LUT kernels (three_qgemm_lut_* and two_qgemm_lut_*)
wrote all batch elements' results to C[i] instead of C[bs*out_stride+i],
so each batch element overwrote the previous one. This caused NaN/garbage
output for any multi-token prompt evaluation (batch_size > 1). Only
single-token generation (batch_size=1) worked correctly.

Fix:
- Add `int out_stride` parameter to all generated kernel functions
- Use `C[bs * out_stride + i]` indexing for output writes
- Pass `m` (ne01) from ggml_qgemm_lut dispatcher as the stride
- Add NaN guard in per_tensor_quant: prevent 127/0 division when
  max activation is zero
- Add NaN guard in scale application: handle zero act_scales safely

The nan-guards-ggml.patch contains corresponding defensive fixes for
quantize_row_i8_s() in ggml-quants.c and the I2_S scale application
paths in ggml.c (submodule: 3rdparty/llama.cpp).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 nan-guards-ggml.patch | 63 +++++++++++++++++++++++++++++++++++++++++++
 utils/codegen_tl2.py  | 62 +++++++++++++++++++++---------------------
 2 files changed, 95 insertions(+), 30 deletions(-)
 create mode 100644 nan-guards-ggml.patch

diff --git a/nan-guards-ggml.patch b/nan-guards-ggml.patch
new file mode 100644
index 000000000..3627afde6
--- /dev/null
+++ b/nan-guards-ggml.patch
@@ -0,0 +1,63 @@
+diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
+index 127c6bcd..e0f1350d 100644
+--- a/ggml/src/ggml-quants.c
++++ b/ggml/src/ggml-quants.c
+@@ -3508,7 +3508,7 @@ void quantize_row_i8_s(const float * x, void * y, int64_t n, float* act_scales,
+     for (int i = 0; i < n; ++i) {
+         max = MAX(max, (double)fabs((double)x[i]));
+     }
+-    float s = 127 / max;
++    float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f;
+     act_scales[0] = s;
+     int32_t sum = 0;
+     for (int i = 0; i < n; ++i) {
+@@ -3530,7 +3530,7 @@ void quantize_row_i8_s_4x1(const float * x, void * y, int64_t n, float* act_scal
+     for (int i = 0; i < n; ++i) {
+         max = MAX(max, (double)fabs((double)x[i]));
+     }
+-    float s = 127 / max;
++    float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f;
+     act_scales[0] = s;
+     int32_t sum = 0;
+     for (int i = 0; i < n / ACT_K_PACK_SIZE; ++i) {
+diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
+index 121f72da..5a5e68fc 100644
+--- a/ggml/src/ggml.c
++++ b/ggml/src/ggml.c
+@@ -12510,7 +12510,8 @@ static void ggml_compute_forward_mul_mat_one_chunk(
+                     
+                     // post compute activation scaling
+                     for (int row = 0; row < 16; row++) {
+-                        tmp[row] = (tmp[row] - act_sums[i1]) / (act_scales[i1]) * (*scale);
++                        float as = act_scales[i1];
++                        tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[i1]) / as * (*scale) : 0.0f;
+                     }
+                 }
+                 else
+@@ -12518,7 +12519,7 @@ static void ggml_compute_forward_mul_mat_one_chunk(
+                     for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) {
+                         if (src0->type == GGML_TYPE_I2_S) {
+                             vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0 * nb01 / 4, 0, src1_col_de, 0, 1);
+-                            tmp[ir0 - iir0] = (tmp[ir0 - iir0]  - act_sums[i1]) / (act_scales[i1]) * (*scale);
++                            { float as = act_scales[i1]; tmp[ir0 - iir0] = (as != 0.0f) ? (tmp[ir0 - iir0] - act_sums[i1]) / as * (*scale) : 0.0f; }
+                         } else {
+                             vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot);
+                         }
+@@ -13266,7 +13267,7 @@ UseGgmlGemm2:;
+                     (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
+                 for (int col = 0; col < ne11 - ne11 % 4; col++) {
+                     for (int row = 0; row < src0_end - src0_start; row++) {
+-                        tmp[col * (src0_end - src0_start) + row] = (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / (act_scales[col]) * (*scale);
++                        { float as = act_scales[col]; tmp[col * (src0_end - src0_start) + row] = (as != 0.0f) ? (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / as * (*scale) : 0.0f; }
+                     }
+                     memcpy((float *)((char *) dst->data + (col * nb1)) + src0_start, tmp + col * (src0_end - src0_start), (src0_end - src0_start) * sizeof(float));
+                 }
+@@ -13287,7 +13288,7 @@ UseGgmlGemm2:;
+                     (const char *) src1_wdata + (src1_col_stride * iter),
+                     1, src0_end - src0_start);
+                 for (int row = 0; row < src0_end - src0_start; row++) {
+-                    tmp[row] = (tmp[row] - act_sums[iter]) / (act_scales[iter]) * (*scale);
++                    { float as = act_scales[iter]; tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[iter]) / as * (*scale) : 0.0f; }
+                 }
+                 memcpy((float *)((char *) dst->data + (iter * nb1)) + src0_start, tmp, (src0_end - src0_start) * sizeof(float));
+             }
diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py
index 4d9408123..27f432f95 100644
--- a/utils/codegen_tl2.py
+++ b/utils/codegen_tl2.py
@@ -89,7 +89,9 @@ def gen_ctor_code():
     __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec));\n\
     max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1));\n\
     max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1));\n\
-    float scales = 127 / _mm_cvtss_f32(max1);\n\
+    float max_val = _mm_cvtss_f32(max1);\n\
+    if (max_val < 1e-10f) max_val = 1e-10f;\n\
+    float scales = 127.0f / max_val;\n\
     *lut_scales = scales;\n\
 #endif\n\
     return 0;\n\
@@ -490,7 +492,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
 }}\n\
 \n\
 template<int BATCH_SIZE>\n\
-int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\
     alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
     memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
 #pragma unroll\n\
@@ -501,14 +503,14 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
     for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\
 #pragma unroll\n\
         for (int i = 0; i < BM{0}; i++) {{\n\
-            ((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
+            ((int32_t*)C)[bs * out_stride + i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
         }}\n\
   }}\n\
   return 0;\n\
 }}\n\
 \n\
 template<int BATCH_SIZE>\n\
-int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\
+int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\
     alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\
     memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\
 #pragma unroll\n\
@@ -519,8 +521,8 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list):
     for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\
 #pragma unroll\n\
         for (int i = 0; i < BM{0}; i++) {{\n\
-            ((int32_t*)C)[i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
-            ((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0];\n\
+            ((int32_t*)C)[bs * out_stride + i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\
+            {{ float ls = ((float*)LUT_Scales)[bs]; ((float*)C)[bs * out_stride + i] = (ls != 0.0f) ? (float)(((int32_t*)C)[bs * out_stride + i]) / ls * ((float*)Scales)[0] : 0.0f; }}\n\
         }}\n\
     }}\n\
   return 0;\n\
@@ -556,32 +558,32 @@ def gen_top_api(kernel_shapes, k_list):
     if (m == {0} && k == {1}) {{\n\
         if (BK == {2}) {{\n\
             if (bs == 1) {{\n\
-                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 8) {{\n\
-                two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 32) {{\n\
-                two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 128) {{\n\
-                two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 256) {{\n\
-                two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 512) {{\n\
-                two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }}\n\
         }}\n\
         else if (BK == {3}) {{\n\
             if (bs == 1) {{\n\
-                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 8) {{\n\
-                three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 32) {{\n\
-                three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 128) {{\n\
-                three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 256) {{\n\
-                three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 512) {{\n\
-                three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}\n\
         }}\n\
     }}\n\
@@ -590,32 +592,32 @@ def gen_top_api(kernel_shapes, k_list):
         kernel_code = "".join([kernel_code, "    else if (m == {0} && k == {1}) {{\n\
         if (BK == {2}) {{\n\
             if (bs == 1) {{\n\
-                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 8) {{\n\
-                two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 32) {{\n\
-                two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 128) {{\n\
-                two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 256) {{\n\
-                two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }} else if (bs == 512) {{\n\
-                two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\
+                two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\
             }}\n\
         }}\n\
         else if (BK == {3}) {{\n\
             if (bs == 1) {{\n\
-                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 8) {{\n\
-                three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 32) {{\n\
-                three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 128) {{\n\
-                three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 256) {{\n\
-                three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}else if (bs == 512) {{\n\
-                three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\
+                three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\
             }}\n\
         }}\n\
     }}\n\

From 2198eabd96f6994dfd258488d30016a03677264d Mon Sep 17 00:00:00 2001
From: Vladimir Moushkov <vladimir@moushkov.info>
Date: Tue, 31 Mar 2026 14:39:55 +0000
Subject: [PATCH 2/2] Add Windows build script for Intel 12700K (AVX2)

Build script for Windows x86_64 targeting Intel Alder Lake (12700K)
with AVX2 support. Includes TL2 kernel generation, NaN patch application,
and usage instructions for CLI, interactive, and server modes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 build_windows_12700k.bat | 122 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 build_windows_12700k.bat

diff --git a/build_windows_12700k.bat b/build_windows_12700k.bat
new file mode 100644
index 000000000..d63bbb51c
--- /dev/null
+++ b/build_windows_12700k.bat
@@ -0,0 +1,122 @@
+@echo off
+REM ============================================================
+REM Build bitnet.cpp for Windows x86_64 (Intel 12700K / AVX2)
+REM ============================================================
+REM
+REM Prerequisites:
+REM   1. Visual Studio 2022 with "Desktop development with C++" workload
+REM   2. LLVM/Clang >= 18: https://github.com/llvm/llvm-project/releases
+REM   3. CMake >= 3.22: https://cmake.org/download/
+REM   4. Ninja: https://github.com/ninja-build/ninja/releases
+REM   5. Python >= 3.9 with conda
+REM
+REM After installing VS 2022, run this from a "Developer Command Prompt for VS 2022"
+REM or "x64 Native Tools Command Prompt for VS 2022"
+REM
+REM Usage:
+REM   build_windows_12700k.bat
+REM ============================================================
+
+echo.
+echo === BitNet TL2 Build for Intel 12700K (AVX2) ===
+echo.
+
+REM Check prerequisites
+where clang >nul 2>&1
+if errorlevel 1 (
+    echo ERROR: clang not found. Install LLVM/Clang ^>= 18 and add to PATH.
+    echo Download: https://github.com/llvm/llvm-project/releases
+    exit /b 1
+)
+
+where cmake >nul 2>&1
+if errorlevel 1 (
+    echo ERROR: cmake not found. Install CMake ^>= 3.22.
+    exit /b 1
+)
+
+where ninja >nul 2>&1
+if errorlevel 1 (
+    echo WARNING: ninja not found, will use default generator.
+    echo For faster builds, install Ninja: https://github.com/ninja-build/ninja/releases
+    set GENERATOR=-G "Visual Studio 17 2022" -T ClangCL
+) else (
+    set GENERATOR=-G Ninja
+)
+
+REM Initialize submodule if needed
+if not exist "3rdparty\llama.cpp\CMakeLists.txt" (
+    echo Initializing submodule...
+    git submodule update --init
+)
+
+REM Apply the ggml NaN guard patch to the submodule
+if exist "nan-guards-ggml.patch" (
+    echo Applying NaN guard patch to llama.cpp submodule...
+    pushd 3rdparty\llama.cpp
+    git apply ..\..\nan-guards-ggml.patch 2>nul
+    popd
+)
+
+REM Step 1: Generate TL2 kernels (uses Python)
+echo.
+echo === Step 1: Generating TL2 kernels ===
+python utils\codegen_tl2.py --model bitnet_b1_58-2B --BM 256 --BK 96 --bm 32
+if errorlevel 1 (
+    echo WARNING: Kernel generation failed. Using existing kernels if available.
+)
+
+REM Step 2: CMake configure
+echo.
+echo === Step 2: CMake Configure ===
+if not exist "build_win" mkdir build_win
+cd build_win
+
+cmake .. %GENERATOR% ^
+    -DCMAKE_C_COMPILER=clang ^
+    -DCMAKE_CXX_COMPILER=clang++ ^
+    -DGGML_BITNET_X86_TL2=ON ^
+    -DCMAKE_BUILD_TYPE=Release
+
+if errorlevel 1 (
+    echo ERROR: CMake configure failed.
+    cd ..
+    exit /b 1
+)
+
+REM Step 3: Build
+echo.
+echo === Step 3: Building ===
+cmake --build . --config Release -j %NUMBER_OF_PROCESSORS%
+
+if errorlevel 1 (
+    echo ERROR: Build failed.
+    cd ..
+    exit /b 1
+)
+
+cd ..
+
+echo.
+echo ============================================================
+echo BUILD COMPLETE
+echo ============================================================
+echo.
+echo Binaries are in: build_win\bin\Release\ (or build_win\bin\)
+echo.
+echo To run inference:
+echo   build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^
+echo       -p "What is an SBOM?" -n 200 --temp 0.7 -t 8
+echo.
+echo Interactive mode:
+echo   build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^
+echo       -i -p "<|im_start|>assistant" ^
+echo       --in-prefix "<|im_start|>user\n" ^
+echo       --in-suffix "<|im_end|>\n<|im_start|>assistant\n" ^
+echo       -n 512 --temp 0.7 --repeat-penalty 1.2 -t 8 ^
+echo       -r "<|im_end|>"
+echo.
+echo Server mode (access from any device):
+echo   build_win\bin\llama-server.exe -m PATH\TO\bitnet_v2_TL2.gguf ^
+echo       --host 0.0.0.0 --port 8080 -t 8 -c 4096
+echo.