From 2171e023b7f1922176a7911fc3d8ab93bd4c40dd Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Thu, 30 Jan 2025 23:35:32 -0600
Subject: [PATCH] things break

---
 Cargo.lock                                    | 12 ++--
 hercules_cg/src/gpu.rs                        | 61 ++++++++++---------
 hercules_samples/call/src/gpu.sch             |  3 +-
 hercules_samples/ccp/src/gpu.sch              |  3 +-
 hercules_samples/dot/src/gpu.sch              |  3 +-
 hercules_samples/fac/src/gpu.sch              |  3 +-
 hercules_samples/matmul/src/gpu.sch           |  3 +-
 juno_samples/antideps/src/cpu.sch             |  2 -
 juno_samples/antideps/src/gpu.sch             |  1 -
 juno_samples/casts_and_intrinsics/src/cpu.sch |  2 -
 juno_samples/casts_and_intrinsics/src/gpu.sch |  2 -
 juno_samples/cava/src/gpu.sch                 |  3 +-
 juno_samples/concat/src/cpu.sch               |  2 -
 juno_samples/concat/src/gpu.sch               |  2 -
 juno_samples/implicit_clone/src/cpu.sch       |  1 -
 juno_samples/matmul/src/gpu.sch               |  1 -
 juno_samples/nested_ccp/src/gpu.sch           |  1 -
 juno_samples/simple3/src/gpu.sch              |  1 -
 18 files changed, 50 insertions(+), 56 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 623fc35c..303b1b78 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1206,9 +1206,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
 
 [[package]]
 name = "libfuzzer-sys"
-version = "0.4.8"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa"
+checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75"
 dependencies = [
  "arbitrary",
  "cc",
@@ -2163,9 +2163,9 @@ dependencies = [
 
 [[package]]
 name = "toml_edit"
-version = "0.22.22"
+version = "0.22.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5"
+checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee"
 dependencies = [
  "indexmap",
  "serde",
@@ -2433,9 +2433,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
 [[package]]
 name = "winnow"
-version = "0.6.24"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a"
+checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419"
 dependencies = [
  "memchr",
 ]
diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index a266deea..55f8f83c 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -149,8 +149,8 @@ pub fn gpu_codegen<W: Write>(
     }
 
     let return_parameter = if collection_objects.returned_objects().len() == 1 {
-        Some(collection_objects.origin(*collection_objects.returned_objects()
-            .first().unwrap()).try_parameter().unwrap())
+        collection_objects.origin(*collection_objects.returned_objects()
+            .first().unwrap()).try_parameter()
     } else {
         None
     };
@@ -568,11 +568,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             panic!("Expected fork node");
         };
         let reduces = &self.fork_reduce_map[root_fork];
-        assert!(reduces.iter().all(|reduce| {
-            self.collection_objects.objects(*reduce).iter().all(|object| {
-                self.collection_objects.origin(*object).try_parameter().is_some()
-            })
-        }), "All collection reduces in block fork must originate from parameters");
         if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork)
         {
             let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * ");
@@ -977,34 +972,44 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
             // Parameters emitted at top
             Node::Parameter { index: _ } => {}
             // If the constant is primitive, it's stored in register so we repeat
-            // for all threads. Otherwise, it's stored in shared memory so we only
-            // want to "allocate" and initialize it once.
+            // for all threads. Otherwise, it can be inside or outside block fork.
+            // If inside, it's stored in shared memory so we only want to "allocate"
+            // and initialize it once. In either case, we then parallelize memset to 0.
             Node::Constant { id: cons_id } => {
                 let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive();
-                if !is_primitive {
-                    let cg_tile = {
-                        let KernelState::OutBlock = state else {
-                            panic!("Expected constant to be in start basic block
-                            outside any fork");
-                        };
-                        "block".to_string()
-                    };
+                let cg_tile = match state {
+                    KernelState::OutBlock | KernelState::InBlock => "block".to_string(),
+                    KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId),
+                };
+                if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() {
+                    panic!("GPU can't memset collection for multi-block grid");
+                }
+                if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?;
                     *num_tabs += 1;
                 }
-                self.codegen_constant(
-                    define_variable,
-                    *cons_id,
-                    true,
-                    Some(extra_dim_collects),
-                    dynamic_shared_offset,
-                    w,
-                    *num_tabs,
-                )?;
-                if !is_primitive {
+                if is_primitive || state != KernelState::OutBlock {
+                    self.codegen_constant(
+                        define_variable.clone(),
+                        *cons_id,
+                        true,
+                        Some(extra_dim_collects),
+                        dynamic_shared_offset,
+                        w,
+                        *num_tabs,
+                    )?;
+                }
+                if !is_primitive && state != KernelState::OutBlock {
                     write!(w, "{}}}\n", tabs)?;
                     *num_tabs -= 1;
                 }
+                if !is_primitive {
+                    let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects));
+                    write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?;
+                    write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?;
+                    write!(w, "{}}}\n", tabs)?;
+                    write!(w, "{}{}.sync();\n", tabs, cg_tile)?;
+                }
             }
             // Dynamic constants emitted at top
             Node::DynamicConstant { id: _ } => {}
@@ -1212,7 +1217,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()]));
                 let data_variable = self.get_value(*data, false, false);
                 let data_type_id = self.typing[data.idx()];
-                if KernelState::OutBlock == state && is_block_parallel.unwrap() {
+                if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() {
                     panic!("GPU can't guarantee correctness for multi-block collection writes");
                 }
                 let cg_tile = match state {
diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch
index 1e654e22..6c10c2ce 100644
--- a/hercules_samples/call/src/gpu.sch
+++ b/hercules_samples/call/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.add);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+gcm(*);
diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch
index d8f6a2d0..2852b7a4 100644
--- a/hercules_samples/ccp/src/gpu.sch
+++ b/hercules_samples/ccp/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.tricky);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+gcm(*);
diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch
index 4adbf530..4ec3aaef 100644
--- a/hercules_samples/dot/src/gpu.sch
+++ b/hercules_samples/dot/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.dot);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+gcm(*);
diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch
index 1885854c..6eea1273 100644
--- a/hercules_samples/fac/src/gpu.sch
+++ b/hercules_samples/fac/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.fac);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+gcm(*);
diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch
index 9a714789..ca6cdbb9 100644
--- a/hercules_samples/matmul/src/gpu.sch
+++ b/hercules_samples/matmul/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.matmul);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch
index 9c2c44a8..7e6be7ee 100644
--- a/juno_samples/antideps/src/cpu.sch
+++ b/juno_samples/antideps/src/cpu.sch
@@ -14,7 +14,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch
index 25dba2e7..e166515d 100644
--- a/juno_samples/antideps/src/gpu.sch
+++ b/juno_samples/antideps/src/gpu.sch
@@ -18,4 +18,3 @@ gcm(*);
 float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch
index 9c2c44a8..7e6be7ee 100644
--- a/juno_samples/casts_and_intrinsics/src/cpu.sch
+++ b/juno_samples/casts_and_intrinsics/src/cpu.sch
@@ -14,7 +14,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch
index f051ed8c..64d063be 100644
--- a/juno_samples/casts_and_intrinsics/src/gpu.sch
+++ b/juno_samples/casts_and_intrinsics/src/gpu.sch
@@ -15,7 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index bb91af72..ace9082c 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -13,10 +13,9 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
+// forkify(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*)
-
diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch
index 7b87070a..8ec730d7 100644
--- a/juno_samples/concat/src/cpu.sch
+++ b/juno_samples/concat/src/cpu.sch
@@ -15,7 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch
index 71bed4b4..7bfc6dbe 100644
--- a/juno_samples/concat/src/gpu.sch
+++ b/juno_samples/concat/src/gpu.sch
@@ -16,7 +16,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch
index ebf9d8fe..7e6be7ee 100644
--- a/juno_samples/implicit_clone/src/cpu.sch
+++ b/juno_samples/implicit_clone/src/cpu.sch
@@ -14,6 +14,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index dd2dc14c..3d3f919c 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -18,4 +18,3 @@ gcm(*);
 float-collections(*);
 dce(*);
 gcm(*);
-
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
index 69e18343..4f36ddd8 100644
--- a/juno_samples/nested_ccp/src/gpu.sch
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -15,6 +15,5 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
-float-collections(*);
 dce(*);
 gcm(*);
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
index d27e5831..93e85c48 100644
--- a/juno_samples/simple3/src/gpu.sch
+++ b/juno_samples/simple3/src/gpu.sch
@@ -16,5 +16,4 @@ infer-schedules(*);
 
 gcm(*);
 dce(*);
-float-collections(*);
 gcm(*);
-- 
GitLab