diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index ce52a20e9a0d2e3961f4ab7865ac5b5763df9ae8..a266deea40e187afbf84bb52aa91e96f569001be 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1280,14 +1280,17 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let mut succs = self.control_subgraph.succs(id); let succ1 = succs.next().unwrap(); let succ2 = succs.next().unwrap(); + let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some(); + let succ1_block_name = self.get_block_name(succ1, false); + let succ2_block_name = self.get_block_name(succ2, false); write!( w_term, "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ1_block_name.clone() } else { succ2_block_name.clone() })?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ2_block_name } else { succ1_block_name })?; write!(w_term, "\t}}\n")?; 1 } @@ -1590,7 +1593,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; size } else { format!( - "({} + {} - 1) / {}) * {} + {}", + "({} + {} - 1) / {} * {} + {}", acc, align, align, align, size ) } diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 3b35f73ed370984753fa8e416f3b26c24951283f..4237cc8496ab5cf33cb4702f6ed85539860c0f95 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -387,7 +387,7 @@ impl<'a> RTContext<'a> { { write!(block, "backing_{}.byte_add(", device.name())?; self.codegen_dynamic_constant(offset, block)?; - write!(block, ")")? + write!(block, "), ")? } for dc in dynamic_constants { self.codegen_dynamic_constant(*dc, block)?; diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 4e651fa8986f07dfcc086b67ea70650746d05ea1..8862c11a9273f9808f2148f1067dcf3f5953c11f 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; #[cfg(not(feature = "cuda"))] { + let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a = HerculesCPURef::from_slice(&a); + let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b = HerculesCPURef::from_slice(&b); let mut r = runner!(dot); let c = r.run(8, a, b).await; @@ -21,8 +21,10 @@ fn main() { } #[cfg(feature = "cuda")] { + let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let a = a_box.get_ref(); + let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let b = b_box.get_ref(); let mut r = runner!(dot); diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 92b30c43b385812c02f2685ab3cdc76950dd6960..8e26127040d08c4819f4307dbf6ebae296710257 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("antideps.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("antideps.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..9c2c44a8da96406d0cc42a028dcf7ce38fefecdf --- /dev/null +++ b/juno_samples/antideps/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index d3f4a6c28ccefa44b2dd474e7f54546f1a21eab6..25dba2e7ae565d0600c53ae5fff0d403a5e4f3bc 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_a ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index e43a2ac82f2dc60dc1198a9884cce617aa3d70f5..5d25fbba54843b131f944928fdec120e8c30048e 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("casts_and_intrinsics.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("casts_and_intrinsics.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..9c2c44a8da96406d0cc42a028dcf7ce38fefecdf --- /dev/null +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index b2fb3449954ca567ef3b24adebcf5610763a687e..f051ed8c4aacceb8f86404c3e7cc6e9d140b68de 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.casts_and_intrinsics); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index f3096ec34230ad838ca803e8d8de79c176aa9bd2..ab4fbe594b227483d2835c638bdbfba5f450e7cd 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -116,7 +116,25 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r filter[i, j] = input[chan, r + i - 1, c + j - 1]; } } - res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); + + let tmp : f32[9]; + for r = 0 to 3 { + for c = 0 to 3 { + tmp[r * 3 + c] = filter[r, c]; + } + } + + for i = 0 to 9 - 1 { + for j = 0 to 9 - i - 1 { + if tmp[j] > tmp[j+1] { + let t : f32 = tmp[j]; + tmp[j] = tmp[j+1]; + tmp[j+1] = t; + } + } + } + + res[chan, r, c] = tmp[9 / 2]; } else { res[chan, r, c] = input[chan, r, c]; } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index 07f71c99302c86a9ec3dc5c813b854fe3af983b0..bb91af7271bfc218001c77d64bb7d3f7a7888fbb 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -2,13 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale); +inline(*); +let out = auto-outline(*); +gpu(out.cava); ip-sroa(*); sroa(*); - dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*) + diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index e05808f9e0c415dd3c3af0e480af905c04e0152c..482bbf8deb0af323255b004a7e33e70202acb886 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -9,11 +9,15 @@ use self::cava_rust::CHAN; use self::image_proc::*; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; use image::ImageError; use clap::Parser; +use std::mem; + juno_build::juno!("cava"); fn run_cava( @@ -27,39 +31,67 @@ fn run_cava( coefs: &[f32], tonemap: &[f32], ) -> Box<[u8]> { - assert_eq!(image.len(), CHAN * rows * cols); - let image = HerculesCPURef::from_slice(image); + assert_eq!(image.len(), CHAN * rows * cols); assert_eq!(tstw.len(), CHAN * CHAN); - let tstw = HerculesCPURef::from_slice(tstw); - assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN); - let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); - assert_eq!(weights.len(), num_ctrl_pts * CHAN); - let weights = HerculesCPURef::from_slice(weights); - assert_eq!(coefs.len(), 4 * CHAN); - let coefs = HerculesCPURef::from_slice(coefs); - assert_eq!(tonemap.len(), 256 * CHAN); - let tonemap = HerculesCPURef::from_slice(tonemap); - - let mut r = runner!(cava); - async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image, - tstw, - ctrl_pts, - weights, - coefs, - tonemap, - ) - .await - }).as_slice::<u8>().to_vec().into_boxed_slice() + + #[cfg(not(feature = "cuda"))] + { + let image = HerculesCPURef::from_slice(image); + let tstw = HerculesCPURef::from_slice(tstw); + let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); + let weights = HerculesCPURef::from_slice(weights); + let coefs = HerculesCPURef::from_slice(coefs); + let tonemap = HerculesCPURef::from_slice(tonemap); + let mut r = runner!(cava); + async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image, + tstw, + ctrl_pts, + weights, + coefs, + tonemap, + ) + .await + }).as_slice::<u8>().to_vec().into_boxed_slice() + } + + #[cfg(feature = "cuda")] + { + let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image)); + let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw)); + let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts)); + let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights)); + let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs)); + let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap)); + let mut r = runner!(cava); + let res = async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image.get_ref(), + tstw.get_ref(), + ctrl_pts.get_ref(), + weights.get_ref(), + coefs.get_ref(), + tonemap.get_ref(), + ) + .await + }); + let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() }; + let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + res_cpu + } } enum Error { diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn index b9806c9348e73a3467f29f2f39c981b8f347037c..d901e7e17c527edcbcb01929b14adfcb37e6c642 100644 --- a/juno_samples/concat/src/concat.jn +++ b/juno_samples/concat/src/concat.jn @@ -18,7 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t { } #[entry] -fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 { - let arr3 = concat::<i32, 3, 6>(arr1, arr2); - return sum::<i32, 9>(arr3); +fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 { + let arr3 = concat::<i32, a, b>(arr1, arr2); + return sum::<i32, a + b>(arr3); } diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 680adaeb2070b26b68fba361aafb64e5af2afc03..7b87070aa8624cd7aeb01ffa610e7654eb06a42e 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -2,12 +2,12 @@ gvn(*); phi-elim(*); dce(*); -cpu(concat, sum); +inline(*); +auto-outline(*); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 8ee4ef0e5627aa6a51cb5e86dccd014f18dcfd96..71bed4b4a8913d4edbeadbc58b2ce4ed2492ebc6 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -2,12 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(concat, sum); +inline(*); +let out = auto-outline(*); +gpu(out.concat_entry); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index d0929fbf0a3d5d5073b51f79ee8ab575208eb7cc..78932421df9890fbebfa4e136b4f85f875166130 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -10,14 +10,13 @@ juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; #[cfg(not(feature = "cuda"))] { - let mut a_data = [7, 7, 0]; let a = HerculesCPURef::from_slice(&mut a_data); - let mut b_data = [7, 7, 0, 0, 7, 7]; let b = HerculesCPURef::from_slice(&mut b_data); - let output = r.run(a, b).await; - println!("{}", output); + let output = r.run(3, 6, a, b).await; assert_eq!(output, 42); } #[cfg(feature = "cuda")] @@ -26,8 +25,7 @@ fn main() { let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); let mut b_data = [7, 7, 0, 0, 7, 7]; let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); - let output = r.run(a.get_ref(), b.get_ref()).await; - println!("{}", output); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; assert_eq!(output, 42); } }); diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index dc134e59757a301b2be13ab61ed4d2feb252d1fe..a464568d08ffbdb7426b66b044173f981bf42d8e 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("implicit_clone.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("implicit_clone.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..ebf9d8fee2514fe72e91979cf7ef3f9d90ac91e3 --- /dev/null +++ b/juno_samples/implicit_clone/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch index 443fc778fe6b6f4489d6c5a339b691f488f917c2..0f7c80213d20481ce51628736f0ed07be21da399 100644 --- a/juno_samples/implicit_clone/src/gpu.sch +++ b/juno_samples/implicit_clone/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index ff3e3d8c79091349f9d76716f3d7496bba3d5503..c7f18a99c47239eab72e207f7c1cb8195f45a9f6 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("matmul.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("matmul.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..412e8cbb31c3affbcc1cdd6c5a3c085a93804d74 --- /dev/null +++ b/juno_samples/matmul/src/cpu.sch @@ -0,0 +1,21 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +cpu(out.matmul, out.tiled_64_matmul); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index e85dafdfdbfac2021b0215f625fb8077972be6b4..dd2dc14c064b8e4e9b1ed639507972ec6d507a84 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul, out.tiled_64_matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index 2352ddef0a432d42a09fef72042c516843799290..ec111bc1b9d57eda7416d89d7df06caeb60bc258 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("nested_ccp.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("nested_ccp.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..ebf9d8fee2514fe72e91979cf7ef3f9d90ac91e3 --- /dev/null +++ b/juno_samples/nested_ccp/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 021a05e3f361e4f5d1ae679c33f24282d431b2cc..69e18343665142516174a7a04ed8e8f5fd536d74 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.ccp_example, out.median_array, out.no_underflow); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 412d56a4f4e66af5a6608822219bce16ac5554dc..99ef150d942d256c4475344822efcfb0cb6f693a 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); #[cfg(not(feature = "cuda"))] { - let a = HerculesCPURef::from_slice(&a); + let a = HerculesCPURefMut::from_slice(&mut a); let b = HerculesCPURefMut::from_slice(&mut b); let mut r = runner!(ccp_example); let output_example = r.run(a).await; @@ -23,7 +23,7 @@ fn main() { } #[cfg(feature = "cuda")] { - let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let mut r = runner!(ccp_example); let output_example = r.run(a.get_ref_mut()).await; diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index a0874af7ecb42b29eea3aaaa5628d1c0ddeb2090..bfd37cb50d74bf09118b7b768600d132d7fbf9e1 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("simple3.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("simple3.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch new file mode 100644 index 0000000000000000000000000000000000000000..d933f69c025454f094f67b03711da4a7d23e4d18 --- /dev/null +++ b/juno_samples/simple3/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +dce(*); +float-collections(*); +gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index e97627d43c0e1dccf67f565c686712acb1f080f8..d27e58311603dc6186a3b7a474117fd76766ca09 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple3); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +float-collections(*); +gcm(*);