Kawrakow ikawrakow committed on
Commit
8a399ab
·
unverified ·
1 Parent(s): c34dd82

2-bit quantizations (llama/4897)

Browse files

* imatrix: load

* imatrix: WIP

* imatrix: Add Q2_K quantization

* imatrix: also guard against Q2_K_S quantization without importance matrix

* imatrix: guard even more against low-bit quantization misuse

---------

Co-authored-by: Iwan Kawrakow <[email protected]>

Files changed (4) hide show
  1. ggml-quants.c +900 -50
  2. ggml-quants.h +8 -4
  3. ggml.c +25 -11
  4. ggml.h +6 -3
ggml-quants.c CHANGED
@@ -5,6 +5,8 @@
5
  #include <string.h>
6
  #include <assert.h>
7
  #include <float.h>
 
 
8
 
9
  #ifdef __ARM_NEON
10
 
@@ -1639,6 +1641,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
1639
  return (n/QK_K*sizeof(block_q2_K));
1640
  }
1641
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1642
  //========================= 3-bit (de)-quantization
1643
 
1644
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
@@ -2584,14 +2821,6 @@ static const uint8_t ksigns_iq2xs[128] = {
2584
 
2585
  static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2586
 
2587
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
2588
- (void)x;
2589
- (void)y;
2590
- (void)k;
2591
- assert(k % QK_K == 0);
2592
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2593
- }
2594
-
2595
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2596
  assert(k % QK_K == 0);
2597
  const int nb = k / QK_K;
@@ -2618,33 +2847,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
2618
  }
2619
  }
2620
 
2621
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
2622
- assert(k % QK_K == 0);
2623
- block_iq2_xxs * restrict y = vy;
2624
- quantize_row_iq2_xxs_reference(x, y, k);
2625
- }
2626
-
2627
- size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
2628
- assert(k % QK_K == 0);
2629
- (void)hist; // TODO: collect histograms
2630
-
2631
- for (int j = 0; j < n; j += k) {
2632
- block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
2633
- quantize_row_iq2_xxs_reference(src + j, y, k);
2634
- }
2635
- return (n/QK_K*sizeof(block_iq2_xxs));
2636
- }
2637
-
2638
  // ====================== 2.3125 bpw (de)-quantization
2639
 
2640
- void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
2641
- (void)x;
2642
- (void)y;
2643
- (void)k;
2644
- assert(k % QK_K == 0);
2645
- //fprintf(stderr, "=========================== %s: not implemented\n", __func__);
2646
- }
2647
-
2648
  void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2649
  assert(k % QK_K == 0);
2650
  const int nb = k / QK_K;
@@ -2670,23 +2874,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
2670
  }
2671
  }
2672
 
2673
- void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
2674
- assert(k % QK_K == 0);
2675
- block_iq2_xs * restrict y = vy;
2676
- quantize_row_iq2_xs_reference(x, y, k);
2677
- }
2678
-
2679
- size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
2680
- assert(k % QK_K == 0);
2681
- (void)hist; // TODO: collect histograms
2682
-
2683
- for (int j = 0; j < n; j += k) {
2684
- block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
2685
- quantize_row_iq2_xs_reference(src + j, y, k);
2686
- }
2687
- return (n/QK_K*sizeof(block_iq2_xs));
2688
- }
2689
-
2690
  //===================================== Q8_K ==============================================
2691
 
2692
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
@@ -7730,3 +7917,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
7730
  *s = 0.125f * sumf;
7731
  #endif
7732
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  #include <string.h>
6
  #include <assert.h>
7
  #include <float.h>
8
+ #include <stdlib.h> // for qsort
9
+ #include <stdio.h> // for GGML_ASSERT
10
 
11
  #ifdef __ARM_NEON
12
 
 
1641
  return (n/QK_K*sizeof(block_q2_K));
1642
  }
1643
 
1644
// Quantize n values to integer levels 0..nmax using an affine model
//     x[i] ~= scale * L[i] - *the_min
// and return the scale. The offset is stored negated in *the_min and is never
// allowed to be positive (the running minimum is clamped to <= 0).
//
//   x        input values
//   weights  per-value error weights; when NULL, x[i]*x[i] is used instead
//   L        output levels (n entries)
//   Laux     scratch for candidate levels (n entries)
//   rmin, rdelta, nstep
//            candidate inverse scales are (rmin + rdelta*step + nmax)/(max - min)
//            for step = 0..nstep; with nstep < 1 only the plain min/max fit is used
//   use_mad  true: weighted absolute error, false: weighted squared error
//
// For every candidate the levels are fixed and scale/offset are re-fitted by
// weighted least squares; the candidate with the lowest weighted error wins.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    float lo    = x[0];
    float hi    = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < lo) { lo = x[i]; }
        if (x[i] > hi) { hi = x[i]; }
        const float w = weights ? weights[i] : x[i]*x[i];
        sum_w += w;
        sum_x += w * x[i];
    }
    if (lo > 0) {
        lo = 0;
    }
    if (hi <= lo) {
        // Degenerate range: everything maps to level 0.
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        *the_min = -lo;
        return 0.f;
    }

    // Baseline: plain min/max quantization.
    float iscale   = nmax/(hi - lo);
    float scale    = 1/iscale;
    float best_err = 0;
    for (int i = 0; i < n; ++i) {
        const int l = nearest_int(iscale*(x[i] - lo));
        L[i] = MAX(0, MIN(nmax, l));
        float err = scale * L[i] + lo - x[i];
        err = use_mad ? fabsf(err) : err*err;
        const float w = weights ? weights[i] : x[i]*x[i];
        best_err += w * err;
    }
    if (nstep < 1) {
        *the_min = -lo;
        return scale;
    }

    // Note: `lo` is updated whenever a candidate wins, so later candidates are
    // built around the refined offset. Do not hoist (hi - lo) out of the loop.
    for (int step = 0; step <= nstep; ++step) {
        iscale = (rmin + rdelta*step + nmax)/(hi - lo);
        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale*(x[i] - lo));
            l = MAX(0, MIN(nmax, l));
            Laux[i] = l;
            const float w = weights ? weights[i] : x[i]*x[i];
            sum_l  += w*l;
            sum_l2 += w*l*l;
            sum_xl += w*l*x[i];
        }
        const float D = sum_w * sum_l2 - sum_l * sum_l;
        if (!(D > 0)) {
            continue;
        }
        // Weighted least-squares fit of scale and offset for the fixed levels.
        float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
        float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
        if (cand_min > 0) {
            // Positive offsets are not representable: fit the scale alone.
            cand_min   = 0;
            cand_scale = sum_xl / sum_l2;
        }
        float err_sum = 0;
        for (int i = 0; i < n; ++i) {
            float err = cand_scale * Laux[i] + cand_min - x[i];
            err = use_mad ? fabsf(err) : err*err;
            const float w = weights ? weights[i] : x[i]*x[i];
            err_sum += w * err;
        }
        if (err_sum < best_err) {
            for (int i = 0; i < n; ++i) {
                L[i] = Laux[i];
            }
            best_err = err_sum;
            scale    = cand_scale;
            lo       = cand_min;
        }
    }
    *the_min = -lo;
    return scale;
}
1721
+
1722
// Quantize n values to levels 0..nmax with a single scale, x[i] ~= scale * L[i],
// minimising the error weighted by quant_weights. Returns the scale.
//
//   x              input values; only the upper bound is clamped, so inputs are
//                  presumably non-negative - TODO confirm against callers
//   L              output levels (n entries)
//   quant_weights  per-value error weights (n entries, must not be NULL)
//
// Steps: (1) try inverse scales (0.1*is + nmax)/max for is in -4..4 and keep the
// one with the lowest weighted squared error, (2) quantize with it, (3) run up to
// five coordinate-descent sweeps that change one level at a time whenever that
// increases sumlx^2/suml2.
//
// Returns 0 when all inputs are zero, and also when the weighted level energy
// suml2 is zero (e.g. all weights zero) - previously that case evaluated 0/0
// and returned NaN.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float max = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    if (!max) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }
    float iscale = nmax / max;
    for (int i = 0; i < n; ++i) {
        L[i] = nearest_int(iscale * x[i]);
    }
    float scale = 1/iscale;
    float best_mse = 0;
    for (int i = 0; i < n; ++i) {
        float diff = x[i] - scale*L[i];
        float w = quant_weights[i];
        best_mse += w*diff*diff;
    }
    // Scan a few inverse scales around nmax/max.
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) continue;
        float iscale_is = (0.1f*is + nmax)/max;
        float scale_is = 1/iscale_is;
        float mse = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale_is*x[i]);
            l = MIN(nmax, l);
            float diff = x[i] - scale_is*l;
            float w = quant_weights[i];
            mse += w*diff*diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale = iscale_is;
        }
    }
    // Quantize with the winning inverse scale and accumulate the sums needed
    // for the least-squares scale sumlx/suml2.
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale * x[i]);
        l = MIN(nmax, l);
        L[i] = l;
        float w = quant_weights[i];
        sumlx += w*x[i]*l;
        suml2 += w*l*l;
    }
    // Coordinate descent: re-pick each level given all the others.
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            float w = quant_weights[i];
            float slx = sumlx - w*x[i]*L[i];
            float sl2 = suml2 - w*L[i]*L[i];
            if (slx > 0 && sl2 > 0) {
                int new_l = nearest_int(x[i] * sl2 / slx);
                new_l = MIN(nmax, new_l);
                if (new_l != L[i]) {
                    slx += w*x[i]*new_l;
                    sl2 += w*new_l*new_l;
                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                        L[i] = new_l; sumlx = slx; suml2 = sl2;
                        ++n_changed;
                    }
                }
            }
        }
        if (!n_changed) {
            break;
        }
    }
    // Guard against 0/0 -> NaN when no weighted level energy was accumulated.
    return suml2 > 0 ? sumlx / suml2 : 0.f;
}
1794
+
1795
+ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
1796
+ GGML_ASSERT(quant_weights);
1797
+ assert(k % QK_K == 0);
1798
+ const int nb = k / QK_K;
1799
+ const bool requantize = true;
1800
+
1801
+ uint8_t L[QK_K];
1802
+ uint8_t Laux[16];
1803
+ float mins[QK_K/16];
1804
+ float scales[QK_K/16];
1805
+ float sw[QK_K/16];
1806
+ float weight[QK_K/16];
1807
+ uint8_t Ls[QK_K/16], Lm[QK_K/16];
1808
+
1809
+ for (int i = 0; i < nb; i++) {
1810
+ memset(sw, 0, QK_K/16*sizeof(float));
1811
+ float sumx2 = 0;
1812
+ for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
1813
+ float sigma2 = sumx2/QK_K;
1814
+ for (int j = 0; j < QK_K/16; ++j) {
1815
+ const float * restrict qw = quant_weights + QK_K * i + 16*j;
1816
+ for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
1817
+ for (int l = 0; l < 16; ++l) sw[j] += weight[l];
1818
+ scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
1819
+ }
1820
+
1821
+ float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
1822
+ float mm = make_qp_quants(QK_K/16, 15, mins, Lm, sw);
1823
+ y[i].d = GGML_FP32_TO_FP16(dm);
1824
+ y[i].dmin = GGML_FP32_TO_FP16(mm);
1825
+ dm = GGML_FP16_TO_FP32(y[i].d);
1826
+ mm = GGML_FP16_TO_FP32(y[i].dmin);
1827
+
1828
+ for (int j = 0; j < QK_K/16; ++j) {
1829
+ y[i].scales[j] = Ls[j] | (Lm[j] << 4);
1830
+ }
1831
+
1832
+ if (requantize) {
1833
+ for (int j = 0; j < QK_K/16; ++j) {
1834
+ const float d = dm * (y[i].scales[j] & 0xF);
1835
+ if (!d) continue;
1836
+ const float m = mm * (y[i].scales[j] >> 4);
1837
+ for (int ii = 0; ii < 16; ++ii) {
1838
+ int l = nearest_int((x[16*j + ii] + m)/d);
1839
+ l = MAX(0, MIN(3, l));
1840
+ L[16*j + ii] = l;
1841
+ }
1842
+ }
1843
+ }
1844
+
1845
+ #if QK_K == 256
1846
+ for (int j = 0; j < QK_K; j += 128) {
1847
+ for (int l = 0; l < 32; ++l) {
1848
+ y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
1849
+ }
1850
+ }
1851
+ #else
1852
+ for (int l = 0; l < 16; ++l) {
1853
+ y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
1854
+ }
1855
+ #endif
1856
+
1857
+ x += QK_K;
1858
+
1859
+ }
1860
+ }
1861
+
1862
+ size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1863
+ (void)hist;
1864
+ int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1865
+ if (!quant_weights) {
1866
+ quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
1867
+ }
1868
+ else {
1869
+ char * qrow = (char *)dst;
1870
+ for (int row = 0; row < nrow; ++row) {
1871
+ quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
1872
+ src += n_per_row;
1873
+ qrow += row_size;
1874
+ }
1875
+ }
1876
+ return nrow * row_size;
1877
+ }
1878
+
1879
  //========================= 3-bit (de)-quantization
1880
 
1881
  void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
 
2821
 
2822
  static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
2823
 
 
 
 
 
 
 
 
 
2824
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
2825
  assert(k % QK_K == 0);
2826
  const int nb = k / QK_K;
 
2847
  }
2848
  }
2849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2850
  // ====================== 2.3125 bpw (de)-quantization
2851
 
 
 
 
 
 
 
 
 
2852
  void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
2853
  assert(k % QK_K == 0);
2854
  const int nb = k / QK_K;
 
2874
  }
2875
  }
2876
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2877
  //===================================== Q8_K ==============================================
2878
 
2879
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
 
7917
  *s = 0.125f * sumf;
7918
  #endif
7919
  }
7920
+
7921
+ // ================================ IQ2 quantization =============================================
7922
+
7923
// Lookup tables for one IQ2 grid size, built lazily by q2xs_init_impl().
typedef struct {
    uint64_t * grid;        // grid points, eight int8 coordinates packed per entry
    int      * map;         // packed 2-bit index -> grid index, or negative for off-grid points
    uint16_t * neighbours;  // per off-grid point: a count followed by that many grid indices
} iq2_entry_t;

// Slot 0 serves grid size 256, slot 1 grid size 512 (see iq2_data_index).
static iq2_entry_t iq2_data[2] = {
    { NULL, NULL, NULL },
    { NULL, NULL, NULL },
};
7933
+
7934
// Translate a supported grid size into its slot in iq2_data (256 -> 0, 512 -> 1).
// Any other size trips the assertion.
static inline int iq2_data_index(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
7938
+
7939
// qsort comparator for pairs of ints: orders by the first element, ties broken
// by the second. Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * lhs = (const int *)left;
    const int * rhs = (const int *)right;
    for (int field = 0; field < 2; ++field) {
        if (lhs[field] < rhs[field]) {
            return -1;
        }
        if (lhs[field] > rhs[field]) {
            return 1;
        }
    }
    return 0;
}
7944
+
7945
+ static void q2xs_init_impl(int grid_size) {
7946
+ const int gindex = iq2_data_index(grid_size);
7947
+ if (iq2_data[gindex].grid) {
7948
+ return;
7949
+ }
7950
+ static const uint16_t kgrid_256[256] = {
7951
+ 0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
7952
+ 100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
7953
+ 1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
7954
+ 1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
7955
+ 2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
7956
+ 4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
7957
+ 5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
7958
+ 8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
7959
+ 10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
7960
+ 16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
7961
+ 17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
7962
+ 20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
7963
+ 22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
7964
+ 25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
7965
+ 33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
7966
+ 37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
7967
+ };
7968
+ static const uint16_t kgrid_512[512] = {
7969
+ 0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
7970
+ 73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
7971
+ 260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
7972
+ 352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
7973
+ 640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
7974
+ 1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
7975
+ 1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
7976
+ 2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
7977
+ 2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
7978
+ 4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
7979
+ 4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
7980
+ 5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
7981
+ 5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
7982
+ 8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
7983
+ 8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
7984
+ 10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
7985
+ 16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
7986
+ 16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
7987
+ 16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
7988
+ 17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
7989
+ 18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
7990
+ 20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
7991
+ 21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
7992
+ 22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
7993
+ 24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
7994
+ 32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
7995
+ 33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
7996
+ 33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
7997
+ 35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
7998
+ 37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
7999
+ 40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
8000
+ 42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
8001
+ };
8002
+ const int kmap_size = 43692;
8003
+ const int nwant = 2;
8004
+ const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
8005
+ uint64_t * kgrid_q2xs;
8006
+ int * kmap_q2xs;
8007
+ uint16_t * kneighbors_q2xs;
8008
+
8009
+ printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
8010
+ uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
8011
+ for (int k = 0; k < grid_size; ++k) {
8012
+ int8_t * pos = (int8_t *)(the_grid + k);
8013
+ for (int i = 0; i < 8; ++i) {
8014
+ int l = (kgrid[k] >> 2*i) & 0x3;
8015
+ pos[i] = 2*l + 1;
8016
+ }
8017
+ }
8018
+ kgrid_q2xs = the_grid;
8019
+ iq2_data[gindex].grid = the_grid;
8020
+ kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
8021
+ iq2_data[gindex].map = kmap_q2xs;
8022
+ for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
8023
+ uint64_t aux64;
8024
+ uint8_t * aux8 = (uint8_t *)&aux64;
8025
+ for (int i = 0; i < grid_size; ++i) {
8026
+ aux64 = kgrid_q2xs[i];
8027
+ uint16_t index = 0;
8028
+ for (int k=0; k<8; ++k) {
8029
+ uint16_t q = (aux8[k] - 1)/2;
8030
+ index |= (q << 2*k);
8031
+ }
8032
+ kmap_q2xs[index] = i;
8033
+ }
8034
+ int8_t pos[8];
8035
+ int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
8036
+ int num_neighbors = 0, num_not_in_map = 0;
8037
+ for (int i = 0; i < kmap_size; ++i) {
8038
+ if (kmap_q2xs[i] >= 0) continue;
8039
+ ++num_not_in_map;
8040
+ for (int k = 0; k < 8; ++k) {
8041
+ int l = (i >> 2*k) & 0x3;
8042
+ pos[k] = 2*l + 1;
8043
+ }
8044
+ for (int j = 0; j < grid_size; ++j) {
8045
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8046
+ int d2 = 0;
8047
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8048
+ dist2[2*j+0] = d2;
8049
+ dist2[2*j+1] = j;
8050
+ }
8051
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8052
+ int n = 0; int d2 = dist2[0];
8053
+ int nhave = 1;
8054
+ for (int j = 0; j < grid_size; ++j) {
8055
+ if (dist2[2*j] > d2) {
8056
+ if (nhave == nwant) break;
8057
+ d2 = dist2[2*j];
8058
+ ++nhave;
8059
+ }
8060
+ ++n;
8061
+ }
8062
+ num_neighbors += n;
8063
+ }
8064
+ printf("%s: %d neighbours in total\n", __func__, num_neighbors);
8065
+ kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
8066
+ iq2_data[gindex].neighbours = kneighbors_q2xs;
8067
+ int counter = 0;
8068
+ for (int i = 0; i < kmap_size; ++i) {
8069
+ if (kmap_q2xs[i] >= 0) continue;
8070
+ for (int k = 0; k < 8; ++k) {
8071
+ int l = (i >> 2*k) & 0x3;
8072
+ pos[k] = 2*l + 1;
8073
+ }
8074
+ for (int j = 0; j < grid_size; ++j) {
8075
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
8076
+ int d2 = 0;
8077
+ for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
8078
+ dist2[2*j+0] = d2;
8079
+ dist2[2*j+1] = j;
8080
+ }
8081
+ qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
8082
+ kmap_q2xs[i] = -(counter + 1);
8083
+ int d2 = dist2[0];
8084
+ uint16_t * start = &kneighbors_q2xs[counter++];
8085
+ int n = 0, nhave = 1;
8086
+ for (int j = 0; j < grid_size; ++j) {
8087
+ if (dist2[2*j] > d2) {
8088
+ if (nhave == nwant) break;
8089
+ d2 = dist2[2*j];
8090
+ ++nhave;
8091
+ }
8092
+ kneighbors_q2xs[counter++] = dist2[2*j+1];
8093
+ ++n;
8094
+ }
8095
+ *start = n;
8096
+ }
8097
+ free(dist2);
8098
+ }
8099
+
8100
+ void ggml_init_iq2_quantization(enum ggml_type type) {
8101
+ if (type == GGML_TYPE_IQ2_XXS) {
8102
+ q2xs_init_impl(256);
8103
+ }
8104
+ else if (type == GGML_TYPE_IQ2_XS) {
8105
+ q2xs_init_impl(512);
8106
+ }
8107
+ else {
8108
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8109
+ }
8110
+ }
8111
+
8112
+ static void q2xs_deinit_impl(int grid_size) {
8113
+ GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
8114
+ const int gindex = iq2_data_index(grid_size);
8115
+ if (iq2_data[gindex].grid) {
8116
+ free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
8117
+ free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
8118
+ free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
8119
+ }
8120
+ }
8121
+
8122
+ void ggml_deinit_iq2_quantization(enum ggml_type type) {
8123
+ if (type == GGML_TYPE_IQ2_XXS) {
8124
+ q2xs_deinit_impl(256);
8125
+ }
8126
+ else if (type == GGML_TYPE_IQ2_XS) {
8127
+ q2xs_deinit_impl(512);
8128
+ }
8129
+ else {
8130
+ fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
8131
+ }
8132
+ }
8133
+
8134
// Among the candidate grid points listed in `neighbours`, pick the one whose
// scaled coordinates are closest to xval in weighted squared distance.
//
//   neighbours  neighbours[0] is the candidate count (must be > 0), followed by
//               that many grid indices
//   grid        grid table; each entry holds eight int8 coordinates
//   xval        eight target values
//   weight      eight per-coordinate weights
//   scale       factor applied to the grid coordinates before comparing
//   L           receives (coordinate - 1)/2 for each of the eight coordinates
//               of the chosen point
//
// Returns the grid index of the chosen point.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    const int count = neighbours[0];
    GGML_ASSERT(count > 0);

    const uint16_t * candidates = neighbours + 1;
    int   best_index = -1;
    float best_err   = FLT_MAX;
    for (int j = 0; j < count; ++j) {
        const int8_t * point = (const int8_t *)(grid + candidates[j]);
        float err = 0;
        for (int i = 0; i < 8; ++i) {
            const float q    = point[i];
            const float diff = scale*q - xval[i];
            err += weight[i]*diff*diff;
        }
        if (err < best_err) {
            best_err   = err;
            best_index = candidates[j];
        }
    }
    GGML_ASSERT(best_index >= 0);

    const int8_t * chosen = (const int8_t *)(grid + best_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (chosen[i] - 1)/2;
    }
    return best_index;
}
8157
+
8158
+ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
8159
+
8160
+ const int gindex = iq2_data_index(256);
8161
+
8162
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
8163
+ const int * kmap_q2xs = iq2_data[gindex].map;
8164
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
8165
+
8166
+ GGML_ASSERT(quant_weights);
8167
+ GGML_ASSERT(kgrid_q2xs);
8168
+ GGML_ASSERT(kmap_q2xs);
8169
+ GGML_ASSERT(kneighbors_q2xs);
8170
+ GGML_ASSERT(n%QK_K == 0);
8171
+
8172
+ const int kMaxQ = 3;
8173
+
8174
+ const int nbl = n/256;
8175
+
8176
+ block_iq2_xxs * y = vy;
8177
+
8178
+ float scales[QK_K/32];
8179
+ float weight[32];
8180
+ float xval[32];
8181
+ int8_t L[32];
8182
+ int8_t Laux[32];
8183
+ float waux[32];
8184
+ bool is_on_grid[4];
8185
+ bool is_on_grid_aux[4];
8186
+ uint8_t block_signs[4];
8187
+ uint32_t q2[2*(QK_K/32)];
8188
+
8189
+ for (int ibl = 0; ibl < nbl; ++ibl) {
8190
+
8191
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
8192
+ memset(q2, 0, QK_K/4);
8193
+
8194
+ float max_scale = 0;
8195
+
8196
+ const float * xbl = x + QK_K*ibl;
8197
+ float sumx2 = 0;
8198
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
8199
+ float sigma2 = sumx2/QK_K;
8200
+
8201
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8202
+ const float * xb = xbl + 32*ib;
8203
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8204
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8205
+ for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
8206
+ for (int k = 0; k < 4; ++k) {
8207
+ int nflip = 0;
8208
+ uint8_t s = 0;
8209
+ for (int i = 0; i < 8; ++i) {
8210
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
8211
+ else {
8212
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
8213
+ }
8214
+ }
8215
+ if (nflip%2) {
8216
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
8217
+ for (int i = 1; i < 8; ++i) {
8218
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
8219
+ if (ax < min) {
8220
+ min = ax; imin = i;
8221
+ }
8222
+ }
8223
+ xval[8*k+imin] = -xval[8*k+imin];
8224
+ s ^= (1 << imin);
8225
+ }
8226
+ block_signs[k] = s & 127;
8227
+ }
8228
+ float max = xval[0];
8229
+ for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
8230
+ if (!max) {
8231
+ scales[ib] = 0;
8232
+ memset(L, 0, 32);
8233
+ continue;
8234
+ }
8235
+ float best = 0;
8236
+ float scale = max/(2*kMaxQ-1);
8237
+ for (int is = -9; is <= 9; ++is) {
8238
+ float id = (2*kMaxQ-1+is*0.1f)/max;
8239
+ float this_scale = 1/id;
8240
+ for (int k = 0; k < 4; ++k) {
8241
+ for (int i = 0; i < 8; ++i) {
8242
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8243
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
8244
+ }
8245
+ uint16_t u = 0;
8246
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
8247
+ int grid_index = kmap_q2xs[u];
8248
+ is_on_grid_aux[k] = true;
8249
+ if (grid_index < 0) {
8250
+ is_on_grid_aux[k] = false;
8251
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8252
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
8253
+ }
8254
+ }
8255
+ float sumqx = 0, sumq2 = 0;
8256
+ for (int i = 0; i < 32; ++i) {
8257
+ float w = weight[i];
8258
+ float q = 2*Laux[i] + 1;
8259
+ sumqx += w*xval[i]*q;
8260
+ sumq2 += w*q*q;
8261
+ }
8262
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
8263
+ scale = sumqx/sumq2; best = scale*sumqx;
8264
+ for (int i = 0; i < 32; ++i) L[i] = Laux[i];
8265
+ for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
8266
+ }
8267
+ }
8268
+ int n_not_ongrid = 0;
8269
+ for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
8270
+ if (n_not_ongrid > 0 && scale > 0) {
8271
+ float id = 1/scale;
8272
+ for (int k = 0; k < 4; ++k) {
8273
+ if (is_on_grid[k]) continue;
8274
+ uint16_t u = 0;
8275
+ for (int i = 0; i < 8; ++i) {
8276
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8277
+ l = MAX(0, MIN(kMaxQ-1, l));
8278
+ u |= (l << 2*i);
8279
+ }
8280
+ int grid_index = kmap_q2xs[u];
8281
+ if (grid_index < 0) {
8282
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8283
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
8284
+ }
8285
+ const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
8286
+ for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
8287
+ }
8288
+ float sumqx = 0, sumq2 = 0;
8289
+ for (int i = 0; i < 32; ++i) {
8290
+ float w = weight[i];
8291
+ float q = 2*L[i] + 1;
8292
+ sumqx += w*xval[i]*q;
8293
+ sumq2 += w*q*q;
8294
+ }
8295
+ if (sumq2 > 0) scale = sumqx/sumq2;
8296
+ }
8297
+ if (scale < 0) {
8298
+ // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
8299
+ // and correspondingly flip quant signs.
8300
+ scale = -scale;
8301
+ for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
8302
+ }
8303
+ for (int k = 0; k < 4; ++k) {
8304
+ uint16_t u = 0;
8305
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
8306
+ int grid_index = kmap_q2xs[u];
8307
+ if (grid_index < 0) {
8308
+ printf("Oops: found point %u not on grid:", u);
8309
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
8310
+ printf("\n");
8311
+ GGML_ASSERT(false);
8312
+ }
8313
+ q2[2*ib+0] |= (grid_index << 8*k);
8314
+ q2[2*ib+1] |= (block_signs[k] << 7*k);
8315
+ }
8316
+ GGML_ASSERT(scale >= 0);
8317
+ scales[ib] = scale;
8318
+ max_scale = MAX(max_scale, scale);
8319
+ }
8320
+
8321
+ if (!max_scale) {
8322
+ memset(y[ibl].qs, 0, QK_K/4);
8323
+ continue;
8324
+ }
8325
+
8326
+ float d = max_scale/31;
8327
+ y[ibl].d = GGML_FP32_TO_FP16(d);
8328
+ float id = 1/d;
8329
+ float sumqx = 0, sumq2 = 0;
8330
+ for (int ib = 0; ib < QK_K/32; ++ib) {
8331
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
8332
+ l = MAX(0, MIN(15, l));
8333
+ q2[2*ib+1] |= ((uint32_t)l << 28);
8334
+ const float * xb = xbl + 32*ib;
8335
+ const float * qw = quant_weights + QK_K*ibl + 32*ib;
8336
+ for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8337
+ const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
8338
+ const float db = d * (1 + 2*l);
8339
+ uint32_t u = 0;
8340
+ for (int k = 0; k < 4; ++k) {
8341
+ const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
8342
+ const float * xk = xb + 8*k;
8343
+ const float * wk = weight + 8*k;
8344
+ const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8345
+ float best_mse = 0; int best_index = aux8[k];
8346
+ for (int j = 0; j < 8; ++j) {
8347
+ float diff = db * grid[j] * signs[j] - xk[j];
8348
+ best_mse += wk[j] * diff * diff;
8349
+ }
8350
+ for (int idx = 0; idx < 256; ++idx) {
8351
+ grid = (const uint8_t *)(kgrid_q2xs + idx);
8352
+ float mse = 0;
8353
+ for (int j = 0; j < 8; ++j) {
8354
+ float diff = db * grid[j] * signs[j] - xk[j];
8355
+ mse += wk[j] * diff * diff;
8356
+ }
8357
+ if (mse < best_mse) {
8358
+ best_mse = mse; best_index = idx;
8359
+ }
8360
+ }
8361
+ u |= (best_index << 8*k);
8362
+ grid = (const uint8_t *)(kgrid_q2xs + best_index);
8363
+ //grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
8364
+ for (int j = 0; j < 8; ++j) {
8365
+ float q = db * grid[j] * signs[j];
8366
+ sumqx += wk[j] * q * xk[j];
8367
+ sumq2 += wk[j] * q * q;
8368
+ }
8369
+ }
8370
+ q2[2*ib] = u;
8371
+ if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
8372
+ }
8373
+ memcpy(y[ibl].qs, q2, QK_K/4);
8374
+ }
8375
+ }
8376
+
8377
+ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
8378
+
8379
+ const int gindex = iq2_data_index(512);
8380
+
8381
+ const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
8382
+ const int * kmap_q2xs = iq2_data[gindex].map;
8383
+ const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
8384
+
8385
+ GGML_ASSERT(quant_weights);
8386
+ GGML_ASSERT(kmap_q2xs);
8387
+ GGML_ASSERT(kgrid_q2xs);
8388
+ GGML_ASSERT(kneighbors_q2xs);
8389
+ GGML_ASSERT(n%QK_K == 0);
8390
+
8391
+ const int kMaxQ = 3;
8392
+
8393
+ const int nbl = n/256;
8394
+
8395
+ block_iq2_xs * y = vy;
8396
+
8397
+ float scales[QK_K/16];
8398
+ float weight[16];
8399
+ float xval[16];
8400
+ int8_t L[16];
8401
+ int8_t Laux[16];
8402
+ float waux[16];
8403
+ bool is_on_grid[2];
8404
+ bool is_on_grid_aux[2];
8405
+ uint8_t block_signs[2];
8406
+ uint16_t q2[2*(QK_K/16)];
8407
+
8408
+ for (int ibl = 0; ibl < nbl; ++ibl) {
8409
+
8410
+ y[ibl].d = GGML_FP32_TO_FP16(0.f);
8411
+ memset(q2, 0, QK_K/4);
8412
+ memset(y[ibl].scales, 0, QK_K/32);
8413
+
8414
+ float max_scale = 0;
8415
+
8416
+ const float * xbl = x + QK_K*ibl;
8417
+ float sumx2 = 0;
8418
+ for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
8419
+ float sigma2 = sumx2/QK_K;
8420
+
8421
+ for (int ib = 0; ib < QK_K/16; ++ib) {
8422
+ const float * xb = xbl + 16*ib;
8423
+ const float * qw = quant_weights + QK_K*ibl + 16*ib;
8424
+ for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
8425
+ for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
8426
+ for (int k = 0; k < 2; ++k) {
8427
+ int nflip = 0;
8428
+ uint8_t s = 0;
8429
+ for (int i = 0; i < 8; ++i) {
8430
+ if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
8431
+ else {
8432
+ xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
8433
+ }
8434
+ }
8435
+ if (nflip%2) {
8436
+ int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
8437
+ for (int i = 1; i < 8; ++i) {
8438
+ float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
8439
+ if (ax < min) {
8440
+ min = ax; imin = i;
8441
+ }
8442
+ }
8443
+ xval[8*k+imin] = -xval[8*k+imin];
8444
+ s ^= (1 << imin);
8445
+ }
8446
+ block_signs[k] = s & 127;
8447
+ }
8448
+ float max = xval[0];
8449
+ for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
8450
+ if (!max) {
8451
+ scales[ib] = 0;
8452
+ memset(L, 0, 16);
8453
+ continue;
8454
+ }
8455
+ float best = 0;
8456
+ float scale = max/(2*kMaxQ-1);
8457
+ is_on_grid[0] = is_on_grid[1] = true;
8458
+ for (int is = -9; is <= 9; ++is) {
8459
+ float id = (2*kMaxQ-1+is*0.1f)/max;
8460
+ float this_scale = 1/id;
8461
+ for (int k = 0; k < 2; ++k) {
8462
+ for (int i = 0; i < 8; ++i) {
8463
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8464
+ Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
8465
+ }
8466
+ uint16_t u = 0;
8467
+ for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
8468
+ int grid_index = kmap_q2xs[u];
8469
+ is_on_grid_aux[k] = true;
8470
+ if (grid_index < 0) {
8471
+ is_on_grid_aux[k] = false;
8472
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8473
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
8474
+ }
8475
+ }
8476
+ float sumqx = 0, sumq2 = 0;
8477
+ for (int i = 0; i < 16; ++i) {
8478
+ float w = weight[i];
8479
+ float q = 2*Laux[i] + 1;
8480
+ sumqx += w*xval[i]*q;
8481
+ sumq2 += w*q*q;
8482
+ }
8483
+ if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
8484
+ scale = sumqx/sumq2; best = scale*sumqx;
8485
+ for (int i = 0; i < 16; ++i) L[i] = Laux[i];
8486
+ for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
8487
+ }
8488
+ }
8489
+ int n_not_ongrid = 0;
8490
+ for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
8491
+ if (n_not_ongrid > 0 && scale > 0) {
8492
+ float id = 1/scale;
8493
+ for (int k = 0; k < 2; ++k) {
8494
+ if (is_on_grid[k]) continue;
8495
+ uint16_t u = 0;
8496
+ for (int i = 0; i < 8; ++i) {
8497
+ int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
8498
+ l = MAX(0, MIN(kMaxQ-1, l));
8499
+ u |= (l << 2*i);
8500
+ L[8*k + i] = l;
8501
+ }
8502
+ int grid_index = kmap_q2xs[u];
8503
+ if (grid_index < 0) {
8504
+ const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
8505
+ grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
8506
+ }
8507
+ }
8508
+ float sumqx = 0, sumq2 = 0;
8509
+ for (int i = 0; i < 16; ++i) {
8510
+ float w = weight[i];
8511
+ float q = 2*L[i] + 1;
8512
+ sumqx += w*xval[i]*q;
8513
+ sumq2 += w*q*q;
8514
+ }
8515
+ if (sumq2 > 0) scale = sumqx/sumq2;
8516
+ }
8517
+ if (scale < 0) {
8518
+ scale = -scale;
8519
+ for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
8520
+ }
8521
+ for (int k = 0; k < 2; ++k) {
8522
+ uint16_t u = 0;
8523
+ for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
8524
+ int grid_index = kmap_q2xs[u];
8525
+ if (grid_index < 0) {
8526
+ printf("Oops: found point %u not on grid:", u);
8527
+ for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
8528
+ printf("\n");
8529
+ GGML_ASSERT(false);
8530
+ }
8531
+ q2[2*ib+k] = grid_index | (block_signs[k] << 9);
8532
+ }
8533
+ GGML_ASSERT(scale >= 0);
8534
+ scales[ib] = scale;
8535
+ max_scale = MAX(max_scale, scale);
8536
+ }
8537
+
8538
+ if (!max_scale) {
8539
+ memset(y[ibl].qs, 0, QK_K/4);
8540
+ continue;
8541
+ }
8542
+
8543
+ float d = max_scale/31;
8544
+ y[ibl].d = GGML_FP32_TO_FP16(d);
8545
+ float id = 1/d;
8546
+ for (int ib = 0; ib < QK_K/16; ++ib) {
8547
+ int l = nearest_int(0.5f*(id*scales[ib]-1));
8548
+ l = MAX(0, MIN(15, l));
8549
+ if (ib%2 == 0) y[ibl].scales[ib/2] = l;
8550
+ else y[ibl].scales[ib/2] |= (l << 4);
8551
+ }
8552
+ memcpy(y[ibl].qs, q2, QK_K/4);
8553
+
8554
+ }
8555
+ }
8556
+
8557
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
8558
+ (void)hist;
8559
+ GGML_ASSERT(n_per_row%QK_K == 0);
8560
+ int nblock = n_per_row/QK_K;
8561
+ char * qrow = (char *)dst;
8562
+ for (int row = 0; row < nrow; ++row) {
8563
+ quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
8564
+ src += n_per_row;
8565
+ qrow += nblock*sizeof(block_iq2_xxs);
8566
+ }
8567
+ return nrow * nblock * sizeof(block_iq2_xxs);
8568
+ }
8569
+
8570
+ size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
8571
+ (void)hist;
8572
+ GGML_ASSERT(n_per_row%QK_K == 0);
8573
+ int nblock = n_per_row/QK_K;
8574
+ char * qrow = (char *)dst;
8575
+ for (int row = 0; row < nrow; ++row) {
8576
+ quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
8577
+ src += n_per_row;
8578
+ qrow += nblock*sizeof(block_iq2_xs);
8579
+ }
8580
+ return nrow * nblock * sizeof(block_iq2_xs);
8581
+ }
8582
+
ggml-quants.h CHANGED
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
196
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
197
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
198
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
199
- void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
200
- void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
201
 
202
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
203
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
212
  void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
213
  void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
214
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
215
- void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
216
- void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
217
 
218
  // Dequantization
219
  void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
@@ -246,3 +242,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
246
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
247
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
248
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
 
 
 
 
 
 
 
 
 
196
  void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
197
  void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
198
  void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
 
 
199
 
200
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
201
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
 
210
  void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
211
  void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
212
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
 
 
213
 
214
  // Dequantization
215
  void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
 
242
  void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
243
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
244
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
245
+
246
+ //
247
+ // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
248
+ //
249
+ size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
250
+ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
251
+ size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
252
+
ggml.c CHANGED
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
585
  .type_size = sizeof(block_iq2_xxs),
586
  .is_quantized = true,
587
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
588
- .from_float = quantize_row_iq2_xxs,
589
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xxs_reference,
590
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
591
  .vec_dot_type = GGML_TYPE_Q8_K,
592
  },
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
596
  .type_size = sizeof(block_iq2_xs),
597
  .is_quantized = true,
598
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
599
- .from_float = quantize_row_iq2_xs,
600
- .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference,
601
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
602
  .vec_dot_type = GGML_TYPE_Q8_K,
603
  },
@@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
18665
  return (n/QK8_0*sizeof(block_q8_0));
18666
  }
18667
 
18668
- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
 
 
18669
  size_t result = 0;
 
18670
  switch (type) {
18671
  case GGML_TYPE_Q4_0:
18672
  {
@@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18701
  case GGML_TYPE_Q2_K:
18702
  {
18703
  GGML_ASSERT(start % QK_K == 0);
18704
- block_q2_K * block = (block_q2_K*)dst + start / QK_K;
18705
- result = ggml_quantize_q2_K(src + start, block, n, n, hist);
 
 
 
18706
  } break;
18707
  case GGML_TYPE_Q3_K:
18708
  {
@@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
18731
  case GGML_TYPE_IQ2_XXS:
18732
  {
18733
  GGML_ASSERT(start % QK_K == 0);
18734
- block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K;
18735
- result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist);
 
 
 
 
18736
  } break;
18737
  case GGML_TYPE_IQ2_XS:
18738
  {
18739
  GGML_ASSERT(start % QK_K == 0);
18740
- block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K;
18741
- result = ggml_quantize_iq2_xs(src + start, block, n, n, hist);
 
 
 
 
18742
  } break;
18743
  case GGML_TYPE_F16:
18744
  {
 
585
  .type_size = sizeof(block_iq2_xxs),
586
  .is_quantized = true,
587
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
588
+ .from_float = NULL,
589
+ .from_float_reference = NULL,
590
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
591
  .vec_dot_type = GGML_TYPE_Q8_K,
592
  },
 
596
  .type_size = sizeof(block_iq2_xs),
597
  .is_quantized = true,
598
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
599
+ .from_float = NULL,
600
+ .from_float_reference = NULL,
601
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
602
  .vec_dot_type = GGML_TYPE_Q8_K,
603
  },
 
18665
  return (n/QK8_0*sizeof(block_q8_0));
18666
  }
18667
 
18668
+ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
18669
+ int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
18670
+ (void)imatrix;
18671
  size_t result = 0;
18672
+ int n = nrows * n_per_row;
18673
  switch (type) {
18674
  case GGML_TYPE_Q4_0:
18675
  {
 
18704
  case GGML_TYPE_Q2_K:
18705
  {
18706
  GGML_ASSERT(start % QK_K == 0);
18707
+ GGML_ASSERT(start % n_per_row == 0);
18708
+ size_t start_row = start / n_per_row;
18709
+ size_t row_size = ggml_row_size(type, n_per_row);
18710
+ result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18711
+ GGML_ASSERT(result == row_size * nrows);
18712
  } break;
18713
  case GGML_TYPE_Q3_K:
18714
  {
 
18737
  case GGML_TYPE_IQ2_XXS:
18738
  {
18739
  GGML_ASSERT(start % QK_K == 0);
18740
+ GGML_ASSERT(start % n_per_row == 0);
18741
+ GGML_ASSERT(imatrix);
18742
+ size_t start_row = start / n_per_row;
18743
+ size_t row_size = ggml_row_size(type, n_per_row);
18744
+ result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18745
+ GGML_ASSERT(result == row_size * nrows);
18746
  } break;
18747
  case GGML_TYPE_IQ2_XS:
18748
  {
18749
  GGML_ASSERT(start % QK_K == 0);
18750
+ GGML_ASSERT(start % n_per_row == 0);
18751
+ GGML_ASSERT(imatrix);
18752
+ size_t start_row = start / n_per_row;
18753
+ size_t row_size = ggml_row_size(type, n_per_row);
18754
+ result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
18755
+ GGML_ASSERT(result == row_size * nrows);
18756
  } break;
18757
  case GGML_TYPE_F16:
18758
  {
ggml.h CHANGED
@@ -2067,10 +2067,13 @@ extern "C" {
2067
  GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2068
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2069
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2070
- GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
2071
- GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
2072
 
2073
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
 
 
 
 
 
2074
 
2075
  //
2076
  // Importance matrix
 
2067
  GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2068
  GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2069
  GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
 
 
2070
 
2071
+ GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2072
+ int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2073
+
2074
+ // These are needed for IQ2_XS and IQ2_XXS quantizations
2075
+ GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
2076
+ GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
2077
 
2078
  //
2079
  // Importance matrix