diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 309866b3..34c45f38 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -4747,11 +4747,11 @@ static  __global__ void im2col_f32_f16(
         (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
         (blockIdx.x * (blockDim.y * blockDim.z) + threadIdx.y * blockDim.z + threadIdx.z);
 
-    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = __float2half(0.0f);
+    } else {
         const int offset_src =  threadIdx.x * ofs0 + blockIdx.x * ofs1;
         dst[offset_dst] = __float2half(x[offset_src + iih * IW + iiw]);
-    } else {
-        dst[offset_dst] = __float2half(0.0f);
     }
 }
 
diff --git a/ggml-metal.metal b/ggml-metal.metal
index 4fdcaac9..5d1357cd 100644
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1327,11 +1327,11 @@ kernel void kernel_im2col_f16(
         (tpitg[0] * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * CHW +
         (tgpig[0] * (ntg[1] * ntg[2]) + tpitg[1] * ntg[2] + tpitg[2]);
 
-    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] = 0.0f;
+    } else {
         const int32_t offset_src = tpitg[0] * ofs0 + tgpig[0] * ofs1;
         dst[offset_dst] = x[offset_src + iih * IW + iiw];
-    } else {
-        dst[offset_dst] = 0.0f;
     }
 }
 
diff --git a/ggml.c b/ggml.c
index 2723c5be..584ee468 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1777,7 +1777,6 @@ static void ggml_setup_op_has_task_pass(void) {
         p[GGML_OP_DIAG_MASK_INF          ] = true;
         p[GGML_OP_DIAG_MASK_ZERO         ] = true;
         p[GGML_OP_CONV_TRANSPOSE_1D      ] = true;
-        p[GGML_OP_IM2COL                 ] = true;
         p[GGML_OP_CONV_TRANSPOSE_2D      ] = true;
         p[GGML_OP_FLASH_ATTN_BACK        ] = true;
         p[GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
@@ -5122,8 +5121,6 @@ static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p,
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
 
-// ggml_conv_1d
-
 GGML_API struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
@@ -5263,14 +5260,14 @@ struct ggml_tensor * ggml_conv_2d(
         int                  p1,
         int                  d0,
         int                  d1) {
-    struct ggml_tensor * result = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
 
-    result =
-        ggml_reshape_4d(ctx,
-                ggml_mul_mat(ctx,
-                    ggml_reshape_2d(ctx, result, result->ne[0],  result->ne[3] * result->ne[2] * result->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
-                    ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3])),                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
-                result->ne[1], result->ne[2], a->ne[3], result->ne[3]);                                          // [N, OC, OH, OW]
+    struct ggml_tensor * result =
+        ggml_mul_mat(ctx,
+                ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
+
+    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
 
     return result;
 }
@@ -11757,7 +11754,6 @@ static void ggml_compute_forward_im2col_f16(
     GGML_ASSERT(nb10 == sizeof(float));
 
     if (params->type == GGML_TASK_INIT) {
-        memset(dst->data, 0, ggml_nbytes(dst));
         return;
     }
 
@@ -11783,7 +11779,9 @@ static void ggml_compute_forward_im2col_f16(
                                 const int64_t iiw = iow*s0 + ikw*d0 - p0;
                                 const int64_t iih = ioh*s1 + ikh*d1 - p1;
 
-                                if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
                                     dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
                                 }
                             }