From a0e0559797e1623867741c8ef60a40fdb3f74bf5 Mon Sep 17 00:00:00 2001 From: Vladimir Moushkov Date: Tue, 31 Mar 2026 14:39:10 +0000 Subject: [PATCH 1/2] Fix TL2 kernel batch output stride bug causing NaN for batch_size > 1 The generated TL2 LUT kernels (three_qgemm_lut_* and two_qgemm_lut_*) wrote all batch elements' results to C[i] instead of C[bs*out_stride+i], so each batch element overwrote the previous one. This caused NaN/garbage output for any multi-token prompt evaluation (batch_size > 1). Only single-token generation (batch_size=1) worked correctly. Fix: - Add `int out_stride` parameter to all generated kernel functions - Use `C[bs * out_stride + i]` indexing for output writes - Pass `m` (ne01) from ggml_qgemm_lut dispatcher as the stride - Add NaN guard in per_tensor_quant: prevent 127/0 division when max activation is zero - Add NaN guard in scale application: handle zero act_scales safely The nan-guards-ggml.patch contains corresponding defensive fixes for quantize_row_i8_s() in ggml-quants.c and the I2_S scale application paths in ggml.c (submodule: 3rdparty/llama.cpp). Co-Authored-By: Claude Opus 4.6 (1M context) --- nan-guards-ggml.patch | 63 +++++++++++++++++++++++++++++++++++++++++++ utils/codegen_tl2.py | 62 +++++++++++++++++++++--------------------- 2 files changed, 95 insertions(+), 30 deletions(-) create mode 100644 nan-guards-ggml.patch diff --git a/nan-guards-ggml.patch b/nan-guards-ggml.patch new file mode 100644 index 000000000..3627afde6 --- /dev/null +++ b/nan-guards-ggml.patch @@ -0,0 +1,63 @@ +diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c +index 127c6bcd..e0f1350d 100644 +--- a/ggml/src/ggml-quants.c ++++ b/ggml/src/ggml-quants.c +@@ -3508,7 +3508,7 @@ void quantize_row_i8_s(const float * x, void * y, int64_t n, float* act_scales, + for (int i = 0; i < n; ++i) { + max = MAX(max, (double)fabs((double)x[i])); + } +- float s = 127 / max; ++ float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f; + act_scales[0] = s; + int32_t sum = 0; + for (int i = 0; i < n; ++i) { +@@ -3530,7 +3530,7 @@ void quantize_row_i8_s_4x1(const float * x, void * y, int64_t n, float* act_scal + for (int i = 0; i < n; ++i) { + max = MAX(max, (double)fabs((double)x[i])); + } +- float s = 127 / max; ++ float s = (max > 1e-10) ? (float)(127.0 / max) : 0.0f; + act_scales[0] = s; + int32_t sum = 0; + for (int i = 0; i < n / ACT_K_PACK_SIZE; ++i) { +diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c +index 121f72da..5a5e68fc 100644 +--- a/ggml/src/ggml.c ++++ b/ggml/src/ggml.c +@@ -12510,7 +12510,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( + + // post compute activation scaling + for (int row = 0; row < 16; row++) { +- tmp[row] = (tmp[row] - act_sums[i1]) / (act_scales[i1]) * (*scale); ++ float as = act_scales[i1]; ++ tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[i1]) / as * (*scale) : 0.0f; + } + } + else +@@ -12518,7 +12519,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + if (src0->type == GGML_TYPE_I2_S) { + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0 * nb01 / 4, 0, src1_col_de, 0, 1); +- tmp[ir0 - iir0] = (tmp[ir0 - iir0] - act_sums[i1]) / (act_scales[i1]) * (*scale); ++ { float as = act_scales[i1]; tmp[ir0 - iir0] = (as != 0.0f) ? (tmp[ir0 - iir0] - act_sums[i1]) / as * (*scale) : 0.0f; } + } else { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + } +@@ -13266,7 +13267,7 @@ UseGgmlGemm2:; + (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start); + for (int col = 0; col < ne11 - ne11 % 4; col++) { + for (int row = 0; row < src0_end - src0_start; row++) { +- tmp[col * (src0_end - src0_start) + row] = (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / (act_scales[col]) * (*scale); ++ { float as = act_scales[col]; tmp[col * (src0_end - src0_start) + row] = (as != 0.0f) ? (tmp[col * (src0_end - src0_start) + row] - act_sums[col]) / as * (*scale) : 0.0f; } + } + memcpy((float *)((char *) dst->data + (col * nb1)) + src0_start, tmp + col * (src0_end - src0_start), (src0_end - src0_start) * sizeof(float)); + } +@@ -13287,7 +13288,7 @@ UseGgmlGemm2:; + (const char *) src1_wdata + (src1_col_stride * iter), + 1, src0_end - src0_start); + for (int row = 0; row < src0_end - src0_start; row++) { +- tmp[row] = (tmp[row] - act_sums[iter]) / (act_scales[iter]) * (*scale); ++ { float as = act_scales[iter]; tmp[row] = (as != 0.0f) ? (tmp[row] - act_sums[iter]) / as * (*scale) : 0.0f; } + } + memcpy((float *)((char *) dst->data + (iter * nb1)) + src0_start, tmp, (src0_end - src0_start) * sizeof(float)); + } diff --git a/utils/codegen_tl2.py b/utils/codegen_tl2.py index 4d9408123..27f432f95 100644 --- a/utils/codegen_tl2.py +++ b/utils/codegen_tl2.py @@ -89,7 +89,9 @@ def gen_ctor_code(): __m128 max1 = _mm_max_ps(_mm256_extractf128_ps(max_vec, 1), _mm256_castps256_ps128(max_vec));\n\ max1 = _mm_max_ps(max1, _mm_movehl_ps(max1, max1));\n\ max1 = _mm_max_ss(max1, _mm_movehdup_ps(max1));\n\ - float scales = 127 / _mm_cvtss_f32(max1);\n\ + float max_val = _mm_cvtss_f32(max1);\n\ + if (max_val < 1e-10f) max_val = 1e-10f;\n\ + float scales = 127.0f / max_val;\n\ *lut_scales = scales;\n\ #endif\n\ return 0;\n\ @@ -490,7 +492,7 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list): }}\n\ \n\ template\n\ -int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\ +int32_t three_qgemm_lut_{0}(void* A, void* sign, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\ alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\ memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\ #pragma unroll\n\ @@ -501,14 +503,14 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list): for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\ #pragma unroll\n\ for (int i = 0; i < BM{0}; i++) {{\n\ - ((int32_t*)C)[i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\ + ((int32_t*)C)[bs * out_stride + i] = (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\ }}\n\ }}\n\ return 0;\n\ }}\n\ \n\ template\n\ -int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C) {{\n\ +int32_t two_qgemm_lut_{0}(void* A, void* LUT, void* Scales, void* LUT_Scales, void* C, int out_stride) {{\n\ alignas(32) uint32_t CBits[BATCH_SIZE * BM{0}];\n\ memset(&(CBits[0]), 0, BATCH_SIZE * BM{0} * sizeof(int32_t));\n\ #pragma unroll\n\ @@ -519,8 +521,8 @@ def gen_tbl_impl(pre, BM, BK, bm, k_list): for (int bs = 0; bs < BATCH_SIZE; bs++) {{\n\ #pragma unroll\n\ for (int i = 0; i < BM{0}; i++) {{\n\ - ((int32_t*)C)[i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\ - ((float*)C)[i] = (float)(((int32_t*)C)[i]) / ((float*)LUT_Scales)[bs] * ((float*)Scales)[0];\n\ + ((int32_t*)C)[bs * out_stride + i] += (int32_t)(((int32_t*)CBits)[i + bs * BM{0}]);\n\ + {{ float ls = ((float*)LUT_Scales)[bs]; ((float*)C)[bs * out_stride + i] = (ls != 0.0f) ? (float)(((int32_t*)C)[bs * out_stride + i]) / ls * ((float*)Scales)[0] : 0.0f; }}\n\ }}\n\ }}\n\ return 0;\n\ @@ -556,32 +558,32 @@ def gen_top_api(kernel_shapes, k_list): if (m == {0} && k == {1}) {{\n\ if (BK == {2}) {{\n\ if (bs == 1) {{\n\ - two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 8) {{\n\ - two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 32) {{\n\ - two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 128) {{\n\ - two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 256) {{\n\ - two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 512) {{\n\ - two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\ }}\n\ }}\n\ else if (BK == {3}) {{\n\ if (bs == 1) {{\n\ - three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 8) {{\n\ - three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 32) {{\n\ - three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 128) {{\n\ - three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 256) {{\n\ - three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 512) {{\n\ - three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}\n\ }}\n\ }}\n\ @@ -590,32 +592,32 @@ def gen_top_api(kernel_shapes, k_list): kernel_code = "".join([kernel_code, " else if (m == {0} && k == {1}) {{\n\ if (BK == {2}) {{\n\ if (bs == 1) {{\n\ - two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<1>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 8) {{\n\ - two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<8>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 32) {{\n\ - two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<32>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 128) {{\n\ - two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<128>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 256) {{\n\ - two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<256>(A, LUT, Scales, LUT_Scales, C, m);\n\ }} else if (bs == 512) {{\n\ - two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C);\n\ + two_qgemm_lut_{4}<512>(A, LUT, Scales, LUT_Scales, C, m);\n\ }}\n\ }}\n\ else if (BK == {3}) {{\n\ if (bs == 1) {{\n\ - three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<1>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 8) {{\n\ - three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<8>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 32) {{\n\ - three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<32>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 128) {{\n\ - three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<128>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 256) {{\n\ - three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<256>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}else if (bs == 512) {{\n\ - three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C);\n\ + three_qgemm_lut_{4}<512>(A, sign, LUT, Scales, LUT_Scales, C, m);\n\ }}\n\ }}\n\ }}\n\ From 2198eabd96f6994dfd258488d30016a03677264d Mon Sep 17 00:00:00 2001 From: Vladimir Moushkov Date: Tue, 31 Mar 2026 14:39:55 +0000 Subject: [PATCH 2/2] Add Windows build script for Intel 12700K (AVX2) Build script for Windows x86_64 targeting Intel Alder Lake (12700K) with AVX2 support. Includes TL2 kernel generation, NaN patch application, and usage instructions for CLI, interactive, and server modes. Co-Authored-By: Claude Opus 4.6 (1M context) --- build_windows_12700k.bat | 122 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 build_windows_12700k.bat diff --git a/build_windows_12700k.bat b/build_windows_12700k.bat new file mode 100644 index 000000000..d63bbb51c --- /dev/null +++ b/build_windows_12700k.bat @@ -0,0 +1,122 @@ +@echo off +REM ============================================================ +REM Build bitnet.cpp for Windows x86_64 (Intel 12700K / AVX2) +REM ============================================================ +REM +REM Prerequisites: +REM 1. Visual Studio 2022 with "Desktop development with C++" workload +REM 2. LLVM/Clang >= 18: https://github.com/llvm/llvm-project/releases +REM 3. CMake >= 3.22: https://cmake.org/download/ +REM 4. Ninja: https://github.com/ninja-build/ninja/releases +REM 5. Python >= 3.9 with conda +REM +REM After installing VS 2022, run this from a "Developer Command Prompt for VS 2022" +REM or "x64 Native Tools Command Prompt for VS 2022" +REM +REM Usage: +REM build_windows_12700k.bat +REM ============================================================ + +echo. +echo === BitNet TL2 Build for Intel 12700K (AVX2) === +echo. + +REM Check prerequisites +where clang >nul 2>&1 +if errorlevel 1 ( + echo ERROR: clang not found. Install LLVM/Clang ^>= 18 and add to PATH. + echo Download: https://github.com/llvm/llvm-project/releases + exit /b 1 +) + +where cmake >nul 2>&1 +if errorlevel 1 ( + echo ERROR: cmake not found. Install CMake ^>= 3.22. + exit /b 1 +) + +where ninja >nul 2>&1 +if errorlevel 1 ( + echo WARNING: ninja not found, will use default generator. + echo For faster builds, install Ninja: https://github.com/ninja-build/ninja/releases + set GENERATOR=-G "Visual Studio 17 2022" -T ClangCL +) else ( + set GENERATOR=-G Ninja +) + +REM Initialize submodule if needed +if not exist "3rdparty\llama.cpp\CMakeLists.txt" ( + echo Initializing submodule... + git submodule update --init +) + +REM Apply the ggml NaN guard patch to the submodule +if exist "nan-guards-ggml.patch" ( + echo Applying NaN guard patch to llama.cpp submodule... + pushd 3rdparty\llama.cpp + git apply ..\..\nan-guards-ggml.patch 2>nul + popd +) + +REM Step 1: Generate TL2 kernels (uses Python) +echo. +echo === Step 1: Generating TL2 kernels === +python utils\codegen_tl2.py --model bitnet_b1_58-2B --BM 256 --BK 96 --bm 32 +if errorlevel 1 ( + echo WARNING: Kernel generation failed. Using existing kernels if available. +) + +REM Step 2: CMake configure +echo. +echo === Step 2: CMake Configure === +if not exist "build_win" mkdir build_win +cd build_win + +cmake .. %GENERATOR% ^ + -DCMAKE_C_COMPILER=clang ^ + -DCMAKE_CXX_COMPILER=clang++ ^ + -DGGML_BITNET_X86_TL2=ON ^ + -DCMAKE_BUILD_TYPE=Release + +if errorlevel 1 ( + echo ERROR: CMake configure failed. + cd .. + exit /b 1 +) + +REM Step 3: Build +echo. +echo === Step 3: Building === +cmake --build . --config Release -j %NUMBER_OF_PROCESSORS% + +if errorlevel 1 ( + echo ERROR: Build failed. + cd .. + exit /b 1 +) + +cd .. + +echo. +echo ============================================================ +echo BUILD COMPLETE +echo ============================================================ +echo. +echo Binaries are in: build_win\bin\Release\ (or build_win\bin\) +echo. +echo To run inference: +echo build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^ +echo -p "What is an SBOM?" -n 200 --temp 0.7 -t 8 +echo. +echo Interactive mode: +echo build_win\bin\llama-cli.exe -m PATH\TO\bitnet_v2_TL2.gguf ^ +echo -i -p "<|im_start|>assistant" ^ +echo --in-prefix "<|im_start|>user\n" ^ +echo --in-suffix "<|im_end|>\n<|im_start|>assistant\n" ^ +echo -n 512 --temp 0.7 --repeat-penalty 1.2 -t 8 ^ +echo -r "<|im_end|>" +echo. +echo Server mode (access from any device): +echo build_win\bin\llama-server.exe -m PATH\TO\bitnet_v2_TL2.gguf ^ +echo --host 0.0.0.0 --port 8080 -t 8 -c 4096 +echo.