Akarshan Biswas committed
Commit 4c88a27 · 1 Parent(s): e5107fe

SYCL: Implement a few same-quantized-type copy kernels (llama/13739)

* SYCL: Implement a few same-quantized-type copy kernels

* Use memcpy for copying contiguous tensors

ggml-ci

* feat(sycl): add contiguous tensor copy support and device checks

Adds a memcpy path for contiguous tensors of the same type to optimize data transfer. Updates device support checks to recognize contiguous tensor operations, improving compatibility and performance.
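
In code, the fast path boils down to the check below (a minimal sketch; src0_ddc, src1_ddc, and main_stream are set up earlier in ggml_sycl_cpy, as the diff below shows):

    // Same type and contiguous layout on both sides: one bulk device copy
    // replaces the element-wise copy kernel.
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
    }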

* refactor: replace specific block copy functions with template

The changes replace multiple redundant block copy functions (e.g., cpy_block_q8_0_q8_0, cpy_block_q5_0_q5_0) with a single templated function cpy_blck_q_q. This reduces code duplication by using a generic template that works for any block type, improving maintainability while preserving the same functionality. The template is instantiated with specific block types (e.g., block_q8_0) where needed.
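
The block copy itself is a plain struct assignment, since source and destination share the same quantized layout; this is the template as it appears in the diff below, with comments added:

    /* quantized type same copy */
    template <typename T>
    static void cpy_blck_q_q(const char * cxi, char * cdsti) {
        const T * xi   = (const T *) cxi;  // source block
        T *       dsti = (T *) cdsti;      // destination block
        *dsti = *xi;                       // whole-block copy (quants + scales)
    }

Instantiating it as, e.g., cpy_blck_q_q<block_q8_0> gives one copy routine per block type without per-type boilerplate.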

* Exclude BF16 support for COPY tensors for now
ggml-ci

* perf: adjust SYCL copy kernel block sizes for efficiency

Use ceil_div to ensure full element coverage and update nd_range parameters to better align with SYCL block sizes, improving parallelism and device utilization in copy operations.
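
Each host-side wrapper computes its launch geometry the same way (sketch matching the q8_0 kernel in the diff; ne is the total element count and every work-item copies one block of qk quantized values):

    // Round the work-group count up so the tail elements are still covered;
    // the kernel's `if (i >= ne) return;` guard discards the overshoot.
    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
    stream->parallel_for(
        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
        [=](sycl::nd_item<3> item_ct1) {
            cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
                                       ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
        });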

ggml/src/ggml-sycl/cpy.cpp CHANGED
@@ -1,8 +1,12 @@
 #include "cpy.hpp"
 
 #include <float.h>
+#include <string>
 
 #include "dequantize.hpp"
+#include "ggml-sycl/common.hpp"
+#include "ggml-sycl/presets.hpp"
+#include "ggml.h"
 
 static __dpct_inline__ int best_index_int8(int n, const int8_t * val, float x) {
     if (x <= val[0]) {
@@ -116,6 +120,15 @@ static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
     }
 }
 
+/* quantized type same copy */
+template<typename T>
+static void cpy_blck_q_q(const char * cxi, char * cdsti) {
+    const T * xi = (const T *) cxi;
+    T * dsti = (T *) cdsti;
+    *dsti = *xi;
+}
+
+
 static void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
     float * cdstf = (float *) (cdsti);
 
@@ -311,6 +324,34 @@ template <dequantize_kernel_t dequant, int qk> static void cpy_blck_q_f32(const
     }
 }
 
+
+template <typename T, int qk>
+static void cpy_q_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
+                    const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
+                    const int ne12, const int nb10, const int nb11, const int nb12, const int nb13,
+                    const sycl::nd_item<3> & item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2)) * qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i03 = i / (ne00 * ne01 * ne02);
+    const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
+    const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
+    const int i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00 - i01 * ne00;
+    const int x_offset = (i00 / qk) * nb00 + i01 * nb01 + i02 * nb02 + i03 * nb03;
+
+
+    const int i13 = i / (ne10 * ne11 * ne12);
+    const int i12 = (i - i13 * ne10 * ne11 * ne12) / (ne10 * ne11);
+    const int i11 = (i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11) / ne10;
+    const int i10 = i - i13 * ne10 * ne11 * ne12 - i12 * ne10 * ne11 - i11 * ne10;
+    const int dst_offset = (i10 / qk) * nb10 + i11 * nb11 + i12 * nb12 + i13 * nb13;
+
+    cpy_blck_q_q<T>(cx + x_offset, cdst + dst_offset);
+}
+
 template <cpy_kernel_t cpy_blck, int qk>
 static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00, const int ne01, const int ne02,
                       const int nb00, const int nb01, const int nb02, const int nb03, const int ne10, const int ne11,
@@ -322,6 +363,7 @@ static void cpy_f32_q(const char * cx, char * cdst, const int ne, const int ne00
         return;
     }
 
+
     const int i03 = i / (ne00 * ne01 * ne02);
     const int i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
     const int i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne01 * ne00) / ne00;
@@ -615,6 +657,70 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
     }
 }
 
+static void ggml_cpy_q8_0_q8_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q5_0_q5_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q5_0, QK5_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q5_1_q5_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q5_1, QK5_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q4_0_q4_0(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
+
+static void ggml_cpy_q4_1_q4_1(const char * cx, char * cdst, const int ne, const int ne00, const int ne01,
+                               const int ne02, const int nb00, const int nb01, const int nb02, const int nb03,
+                               const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
+                               const int nb12, const int nb13, queue_ptr stream) {
+
+    const int num_blocks = ceil_div(ne, SYCL_CPY_BLOCK_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), [=](sycl::nd_item<3> item_ct1) {
+            cpy_q_q<block_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, item_ct1);
+        });
+}
+
 void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
     // Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
     scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
@@ -632,8 +738,10 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
 
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+    if ((src0->type == src1->type) && (ggml_is_contiguous(src0) && ggml_is_contiguous(src1))) {
+        GGML_SYCL_DEBUG("%s: memcpy path\n", __func__);
+        main_stream->memcpy(src1_ddc, src0_ddc, ggml_nbytes(src0));
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
                               nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
@@ -684,6 +792,16 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
         ggml_cpy_f32_iq4_nl_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12,
                                  nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_q8_0_q8_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_Q5_0) {
+        ggml_cpy_q5_0_q5_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_Q5_1) {
+        ggml_cpy_q5_1_q5_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_q4_0_q4_0(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_q4_1_q4_1(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else {
         GGML_LOG_ERROR("%s: unsupported type combination (%s to %s)\n", __func__, ggml_type_name(src0->type),
                        ggml_type_name(src1->type));

ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -4226,6 +4226,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             {
                 ggml_type src0_type = op->src[0]->type;
                 ggml_type src1_type = op->src[1]->type;
+                if (src0_type == src1_type && (ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) && src0_type != GGML_TYPE_BF16) {
+                    return true;
+                }
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
                     return true;
                 }
@@ -4271,6 +4274,21 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
                     return true;
                 }
+                if (src0_type == GGML_TYPE_Q8_0 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_Q5_0 && src1_type == GGML_TYPE_Q5_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_Q5_1 && src1_type == GGML_TYPE_Q5_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_Q4_0 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_Q4_1 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
                 return false;
             }
         case GGML_OP_CONCAT: