diff --git a/Cargo.lock b/Cargo.lock index 623fc35c9260676fc9b683bd63e96ac7cbc31a2c..303b1b78116c921043f2240dd71725ecc777fa33 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1206,9 +1206,9 @@ checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "libfuzzer-sys" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" +checksum = "cf78f52d400cf2d84a3a973a78a592b4adc535739e0a5597a0da6f0c357adc75" dependencies = [ "arbitrary", "cc", @@ -2163,9 +2163,9 @@ dependencies = [ [[package]] name = "toml_edit" -version = "0.22.22" +version = "0.22.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ae48d6208a266e853d946088ed816055e556cc6028c5e8e2b84d9fa5dd7c7f5" +checksum = "02a8b472d1a3d7c18e2d61a489aee3453fd9031c33e4f55bd533f4a7adca1bee" dependencies = [ "indexmap", "serde", @@ -2433,9 +2433,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "7e49d2d35d3fad69b39b94139037ecfb4f359f08958b9c11e7315ce770462419" dependencies = [ "memchr", ] diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index a266deea40e187afbf84bb52aa91e96f569001be..55f8f83c6698722defe59c652464fbe3482636d3 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -149,8 +149,8 @@ pub fn gpu_codegen<W: Write>( } let return_parameter = if collection_objects.returned_objects().len() == 1 { - Some(collection_objects.origin(*collection_objects.returned_objects() - .first().unwrap()).try_parameter().unwrap()) + collection_objects.origin(*collection_objects.returned_objects() + .first().unwrap()).try_parameter() } else { None }; @@ -568,11 +568,6 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; panic!("Expected fork node"); }; let reduces = &self.fork_reduce_map[root_fork]; - assert!(reduces.iter().all(|reduce| { - self.collection_objects.objects(*reduce).iter().all(|object| { - self.collection_objects.origin(*object).try_parameter().is_some() - }) - }), "All collection reduces in block fork must originate from parameters"); if self.function.schedules[root_fork.idx()].contains(&Schedule::ParallelFork) { let fork_size = factors.iter().map(|dc| format!("dc{}", dc.idx())).collect::<Vec<_>>().join(" * "); @@ -977,34 +972,44 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; // Parameters emitted at top Node::Parameter { index: _ } => {} // If the constant is primitive, it's stored in register so we repeat - // for all threads. Otherwise, it's stored in shared memory so we only - // want to "allocate" and initialize it once. + // for all threads. Otherwise, it can be inside or outside block fork. + // If inside, it's stored in shared memory so we only want to "allocate" + // and initialize it once. In either case, we then parallelize memset to 0. Node::Constant { id: cons_id } => { let is_primitive = self.types[self.typing[id.idx()].idx()].is_primitive(); - if !is_primitive { - let cg_tile = { - let KernelState::OutBlock = state else { - panic!("Expected constant to be in start basic block - outside any fork"); - }; - "block".to_string() - }; + let cg_tile = match state { + KernelState::OutBlock | KernelState::InBlock => "block".to_string(), + KernelState::InThread => self.get_cg_tile(nesting_fork.unwrap(), CGType::UsePerId), + }; + if !is_primitive && state == KernelState::OutBlock && is_block_parallel.is_some() && is_block_parallel.unwrap() { + panic!("GPU can't memset collection for multi-block grid"); + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}if ({}.thread_rank() == 0) {{\n", tabs, cg_tile)?; *num_tabs += 1; } - self.codegen_constant( - define_variable, - *cons_id, - true, - Some(extra_dim_collects), - dynamic_shared_offset, - w, - *num_tabs, - )?; - if !is_primitive { + if is_primitive || state != KernelState::OutBlock { + self.codegen_constant( + define_variable.clone(), + *cons_id, + true, + Some(extra_dim_collects), + dynamic_shared_offset, + w, + *num_tabs, + )?; + } + if !is_primitive && state != KernelState::OutBlock { write!(w, "{}}}\n", tabs)?; *num_tabs -= 1; } + if !is_primitive { + let data_size = self.get_size(self.typing[id.idx()], None, Some(extra_dim_collects)); + write!(w, "{}for (int i = {}.thread_rank(); i < {}; i += {}.size()) {{\n", tabs, cg_tile, data_size, cg_tile)?; + write!(w, "{}\t*({} + i) = 0;\n", tabs, define_variable)?; + write!(w, "{}}}\n", tabs)?; + write!(w, "{}{}.sync();\n", tabs, cg_tile)?; + } } // Dynamic constants emitted at top Node::DynamicConstant { id: _ } => {} @@ -1212,7 +1217,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let collect_with_indices = self.codegen_collect(*collect, indices, extra_dim_collects.contains(&self.typing[collect.idx()])); let data_variable = self.get_value(*data, false, false); let data_type_id = self.typing[data.idx()]; - if KernelState::OutBlock == state && is_block_parallel.unwrap() { + if KernelState::OutBlock == state && is_block_parallel.is_some() && is_block_parallel.unwrap() { panic!("GPU can't guarantee correctness for multi-block collection writes"); } let cg_tile = match state { diff --git a/hercules_samples/call/src/gpu.sch b/hercules_samples/call/src/gpu.sch index 1e654e22664a5b9c79a20888d1f013f9256d7d82..6c10c2ce91f1833a9a0b5cdff8feb1b3326bfed2 100644 --- a/hercules_samples/call/src/gpu.sch +++ b/hercules_samples/call/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.add); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/ccp/src/gpu.sch b/hercules_samples/ccp/src/gpu.sch index d8f6a2d00171255a1596d73dc9bf4ecff7ff308a..2852b7a4c3df9332188f3cea5c1e18393b909e6c 100644 --- a/hercules_samples/ccp/src/gpu.sch +++ b/hercules_samples/ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.tricky); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/dot/src/gpu.sch b/hercules_samples/dot/src/gpu.sch index 4adbf530a7ce8277cd8c691349f36de65ba4e251..4ec3aaef7b786f61dfb67f11815f8cc08d641066 100644 --- a/hercules_samples/dot/src/gpu.sch +++ b/hercules_samples/dot/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.dot); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/fac/src/gpu.sch b/hercules_samples/fac/src/gpu.sch index 1885854ca2d2b05e000d64165244080060b2a4f9..6eea1273d027ff58619c02e9b666246322399551 100644 --- a/hercules_samples/fac/src/gpu.sch +++ b/hercules_samples/fac/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.fac); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/hercules_samples/matmul/src/gpu.sch b/hercules_samples/matmul/src/gpu.sch index 9a714789ac768405a2782cac6e68338c2f0697bb..ca6cdbb9a155e64def57e5a41abe9145268577e9 100644 --- a/hercules_samples/matmul/src/gpu.sch +++ b/hercules_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,5 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +gcm(*); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch index 9c2c44a8da96406d0cc42a028dcf7ce38fefecdf..7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e 100644 --- a/juno_samples/antideps/src/cpu.sch +++ b/juno_samples/antideps/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index 25dba2e7ae565d0600c53ae5fff0d403a5e4f3bc..e166515dc5562f2e142792229fea92309a42e526 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch index 9c2c44a8da96406d0cc42a028dcf7ce38fefecdf..7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e 100644 --- a/juno_samples/casts_and_intrinsics/src/cpu.sch +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -14,7 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index f051ed8c4aacceb8f86404c3e7cc6e9d140b68de..64d063be12c839e81bb3f942d2a201809096c4ea 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index bb91af7271bfc218001c77d64bb7d3f7a7888fbb..ace9082cbdb6a5b5512d88f435169df8eaf1238a 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -13,10 +13,9 @@ gvn(*); phi-elim(*); dce(*); +// forkify(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*) - diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 7b87070aa8624cd7aeb01ffa610e7654eb06a42e..8ec730d7c7901d3763c04f8cadbe89f8ca187cb4 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -15,7 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 71bed4b4a8913d4edbeadbc58b2ce4ed2492ebc6..7bfc6dbe1a45581aa7ed92b804f46ad4f9a572bd 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -16,7 +16,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch index ebf9d8fee2514fe72e91979cf7ef3f9d90ac91e3..7e6be7eefb05b7660d3c27ae9d937ce00cf79a0e 100644 --- a/juno_samples/implicit_clone/src/cpu.sch +++ b/juno_samples/implicit_clone/src/cpu.sch @@ -14,6 +14,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index dd2dc14c064b8e4e9b1ed639507972ec6d507a84..3d3f919cd26e4eba480540df06f839a8b86976b0 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -18,4 +18,3 @@ gcm(*); float-collections(*); dce(*); gcm(*); - diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 69e18343665142516174a7a04ed8e8f5fd536d74..4f36ddd8102209a7a9da5223069d6617c499a4b0 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -15,6 +15,5 @@ dce(*); infer-schedules(*); gcm(*); -float-collections(*); dce(*); gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index d27e58311603dc6186a3b7a474117fd76766ca09..93e85c48ad442a6a9ed17cdf3e59bbe67989b73d 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -16,5 +16,4 @@ infer-schedules(*); gcm(*); dce(*); -float-collections(*); gcm(*);