Use Cats backpressure to throttle writes (and conversions to HTTP requests) #1000

Open · wants to merge 4 commits into base: master
22 changes: 7 additions & 15 deletions src/main/scala/cognite/spark/v1/RawTableRelation.scala
@@ -1,6 +1,7 @@
 package cognite.spark.v1

 import cats.effect.IO
+import cats.effect.std.Backpressure
 import cats.implicits._
 import cognite.spark.v1.PushdownUtilities.getTimestampLimit
 import com.codahale.metrics.Counter
@@ -287,22 +288,13 @@ class RawTableRelation(
       dfWithUnRenamedKeyColumns.foreachPartition((rows: Iterator[Row]) => {
         config.maxOutstandingRawInsertRequests match {
           case Some(maxOutstandingRawInsertRequests) =>
-            // We first group by batch size of a write, and then group that by the number of allowed parallel
-            // outstanding requests to avoid queueing up too many requests towards the RAW API (and this potentially
-            // leading to an OutOfMemory)
-            // Note: This is a suboptimal fix, as if one of the requests in a batch is slow, we will not
-            // start on the next batch (this limitation used to be per partition). Instead, we should
-            // have a cats.effect.std.Semaphore permit with X number of outstanding requests
-            // or cats.effect.concurrent.Backpressure.
-            rows
-              .grouped(batchSize)
-              .toSeq
-              .grouped(maxOutstandingRawInsertRequests)
-              .foreach { batch =>
-                batch.toVector
-                  .parTraverse_(postRows(columnNames, _))
-                  .unsafeRunSync()
+            Backpressure[IO](Backpressure.Strategy.Lossless, maxOutstandingRawInsertRequests)
+              .flatMap { backpressure =>
+                rows.grouped(batchSize).toVector.parTraverse_ { batch: Seq[Row] =>
Review comment (Contributor):

Compared to the old _.grouped.toSeq.grouped.foreach shape, this now creates a vector of all batches, which holds a reference to every Row from the iterator. That can be a non-trivial amount of memory: less than the serialized request bodies, but more than constant memory.

But compared to a plain .grouped.toVector it is about the same, apart from now having the semaphore.
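The laziness difference the comment above describes can be checked with a stdlib-only sketch (the element count and batch size are illustrative, not taken from the PR):

```scala
object GroupedLaziness extends App {
  // Iterator#grouped is lazy: each batch is built only when pulled, so the old
  // .grouped(batchSize)...foreach shape held roughly one group of batches at a time.
  val lazyBatches: Iterator[Seq[Int]] = Iterator.range(0, 10).grouped(3)

  // .toVector forces every batch (and so a reference to every element) into
  // memory up front, which is the trade-off this comment points out.
  val allBatches: Vector[Seq[Int]] = Iterator.range(0, 10).grouped(3).toVector

  assert(allBatches.length == 4)
  assert(allBatches.head.toList == List(0, 1, 2))
  println(s"materialised ${allBatches.length} batches at once")
}
```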

Review comment (Contributor):

There doesn't seem to be an out-of-the-box Seq.parTraverse_ (or, better, one that would stop pulling more items while the semaphore is full), so for now we can try the .toVector.parTraverse_.

+                  backpressure.metered(postRows(columnNames, batch))
+                }
               }
+              .unsafeRunSync()
           case None =>
             // Same behavior as before, which is prone to OutOfMemory if the RAW API calls are too slow
             // to finish
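For readers without the cats-effect context: Backpressure with Strategy.Lossless and a bound of N behaves like a semaphore with N permits — each metered effect waits for a permit, runs, then releases it, so at most N RAW insert requests are ever outstanding. The following is a stdlib-only sketch of that bounding behaviour, with java.util.concurrent.Semaphore standing in for Backpressure (the pool size, sleep, and all names here are illustrative, not the PR's code):

```scala
import java.util.concurrent.{Executors, Semaphore, TimeUnit}
import java.util.concurrent.atomic.AtomicInteger

object BackpressureSketch extends App {
  val maxOutstanding = 2                  // plays the role of maxOutstandingRawInsertRequests
  val permits = new Semaphore(maxOutstanding)
  val inFlight = new AtomicInteger(0)
  val peak = new AtomicInteger(0)         // highest concurrency ever observed

  val pool = Executors.newFixedThreadPool(8)
  val batches = (1 to 10).grouped(2).toVector

  batches.foreach { batch =>
    pool.submit(new Runnable {
      def run(): Unit = {
        permits.acquire()                 // like backpressure.metered: wait for a free slot
        try {
          val now = inFlight.incrementAndGet()
          peak.updateAndGet(p => math.max(p, now))
          Thread.sleep(20)                // stands in for the postRows(columnNames, batch) call
          inFlight.decrementAndGet()
        } finally permits.release()       // slot freed, next batch may start
      }
    })
  }
  pool.shutdown()
  pool.awaitTermination(5, TimeUnit.SECONDS)
  assert(peak.get() <= maxOutstanding)    // never more than N requests in flight
  println(s"peak concurrency: ${peak.get()}")
}
```

The same bound is what prevents the OutOfMemory scenario: requests queue on the semaphore instead of piling up as in-flight bodies.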