From 2171e023b7f1922176a7911fc3d8ab93bd4c40dd Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Thu, 30 Jan 2025 23:35:32 -0600 Subject: [PATCH] things braek --- Cargo.lock | 12 ++-- hercules_cg/src/gpu.rs | 61 ++++++++++--------- hercules_samples/call/src/gpu.sch | 3 +- hercules_samples/ccp/src/gpu.sch | 3 +- hercules_samples/dot/src/gpu.sch | 3 +- hercules_samples/fac/src/gpu.sch | 3 +- hercules_samples/matmul/src/gpu.sch | 3 +- juno_samples/antideps/src/cpu.sch | 2 - juno_samples/antideps/src/gpu.sch | 1 - juno_samples/casts_and_intrinsics/src/cpu.sch | 2 - juno_samples/casts_and_intrinsics/src/gpu.sch | 2 - juno_samples/cava/src/gpu.sch | 3 +- juno_samples/concat/src/cpu.sch | 2 - juno_samples/concat/src/gpu.sch | 2 - juno_samples/implicit_clone/src/cpu.sch | 1 - juno_samples/matmul/src/gpu.sch | 1 - juno_samples/nested_ccp/src/gpu.sch | 1 - juno_samples/simple3/src/gpu.sch | 1 - 18 files changed, 50 insertions(+), 56 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 623fc35c..303b1b78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1206,9 +1206,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2163,9 +2163,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "serde", @@ -2433,9 +2433,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419" dependencies = [ "memchr", ] diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a266deea..55f8f83c 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -149,8 +149,8 @@ pub fn gpu_codegen<W: Write>( } let return_parameter = if collection_objects.returned_objects().len() == 1 { - Some(collection_objects.origin(*collection_objects.returned_objects() - .first().unwrap()).try_parameter().unwrap()) + collection_objects.origin(*collection_objects.returned_objects() + .first().unwrap()).try_parameter() } else { None }; @@ -568,11 +568,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Expected fork node"); }; let reduces = &self.fork_reduce_map[root_fork]; - assert!(reduces.iter().all(|reduce| { - self.collection_objects.objects(*reduce).iter().all(|object| { - self.collection_objects.origin(*object).try_parameter().is_some() - }) - }), "All collection reduces in block fork must originate from parameters"); if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); @@ -977,34 +972,44 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // Parameters emitted at top Node::Parameter { index: _ } => {} // If the constant is primitive, it's stored in register so we repeat - // for all threads. Otherwise, it's stored in shared memory so we only - // want to "allocate" and initialize it once. + // for all threads. Otherwise, it can be inside or outside block fork. + // If inside, it's stored in shared memory so we only want to "allocate" + // and initialize it once. In either case, we then parallelize memset to 0. Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); - if !is_primitive { - let cg_tile = { - let KernelState::OutBlock = state else { - panic!("Expected constant to be in start basic block - outside any fork"); - }; - "block".to_string() - }; + let cg_tile = match state { + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), + }; + if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() { + panic!("GPU can't memset collection for multi-block grid"); + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; } - self.codegen_constant( - define_variable, - *cons_id, - true, - Some(extra_dim_collects), - dynamic_shared_offset, - w, - *num_tabs, - )?; - if !is_primitive { + if is_primitive || state != KernelState::OutBlock { + self.codegen_constant( + define_variable.clone(), + *cons_id, + true, + Some(extra_dim_collects), + dynamic_shared_offset, + w, + *num_tabs, + )?; + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; *num_tabs -= 1; } + if !is_primitive { + let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + } } // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} @@ -1212,7 +1217,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && is_block_parallel.unwrap() { + if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() { panic!("GPU can't guarantee correctness for multi-block collection writes"); } let cg_tile = match state { diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch index 1e654e22..6c10c2ce 100644 --- a/hercules_samples/call/src/gpu.sch +++ b/hercules_samples/call/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.add); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch index d8f6a2d0..2852b7a4 100644 --- a/hercules_samples/ccp/src/gpu.sch +++ b/hercules_samples/ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.tricky); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 4adbf530..4ec3aaef 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.dot); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch index 1885854c..6eea1273 100644 --- a/hercules_samples/fac/src/gpu.sch +++ b/hercules_samples/fac/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.fac); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 9a714789..ca6cdbb9 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch index 9c2c44a8..7e6be7ee 100644 --- a/juno_samples/antideps/src/cpu.sch +++ b/juno_samples/antideps/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index 25dba2e7..e166515d 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch index 9c2c44a8..7e6be7ee 100644 --- a/juno_samples/casts_and_intrinsics/src/cpu.sch +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index f051ed8c..64d063be 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bb91af72..ace9082c 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -13,10 +13,9 @@ gvn(*); phi-elim(*); dce(*); +// forkify(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*) - diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 7b87070a..8ec730d7 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 71bed4b4..7bfc6dbe 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -16,7 +16,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch index ebf9d8fe..7e6be7ee 100644 --- a/juno_samples/implicit_clone/src/cpu.sch +++ b/juno_samples/implicit_clone/src/cpu.sch @@ -14,6 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index dd2dc14c..3d3f919c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 69e18343..4f36ddd8 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -15,6 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index d27e5831..93e85c48 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -16,5 +16,4 @@ infer-schedules(*); gcm(*); dce(*); -float-collections(*); gcm(*); -- GitLab