From 948fe3b976233754ff4eecb6aa7b7b1fcdaf33b5 Mon Sep 17 00:00:00 2001 From: Russel Arbore <rarbore2@illinois.edu> Date: Thu, 30 Jan 2025 19:46:06 -0600 Subject: [PATCH] before get exposed by forkify --- hercules_cg/src/gpu.rs | 9 +- hercules_cg/src/rt.rs | 2 +- hercules_samples/dot/src/main.rs | 6 +- juno_samples/antideps/build.rs | 9 +- juno_samples/antideps/src/cpu.sch | 20 +++++ juno_samples/antideps/src/gpu.sch | 5 +- juno_samples/casts_and_intrinsics/build.rs | 9 +- juno_samples/casts_and_intrinsics/src/cpu.sch | 20 +++++ juno_samples/casts_and_intrinsics/src/gpu.sch | 5 +- juno_samples/cava/src/cava.jn | 20 ++++- juno_samples/cava/src/gpu.sch | 10 ++- juno_samples/cava/src/main.rs | 86 +++++++++++++------ juno_samples/concat/src/concat.jn | 6 +- juno_samples/concat/src/cpu.sch | 8 +- juno_samples/concat/src/gpu.sch | 9 +- juno_samples/concat/src/main.rs | 10 +-- juno_samples/implicit_clone/build.rs | 9 +- juno_samples/implicit_clone/src/cpu.sch | 19 ++++ juno_samples/implicit_clone/src/gpu.sch | 4 +- juno_samples/matmul/build.rs | 9 +- juno_samples/matmul/src/cpu.sch | 21 +++++ juno_samples/matmul/src/gpu.sch | 5 +- juno_samples/nested_ccp/build.rs | 9 +- juno_samples/nested_ccp/src/cpu.sch | 19 ++++ juno_samples/nested_ccp/src/gpu.sch | 4 +- juno_samples/nested_ccp/src/main.rs | 6 +- juno_samples/simple3/build.rs | 9 +- juno_samples/simple3/src/cpu.sch | 19 ++++ juno_samples/simple3/src/gpu.sch | 4 +- 29 files changed, 264 insertions(+), 107 deletions(-) create mode 100644 juno_samples/antideps/src/cpu.sch create mode 100644 juno_samples/casts_and_intrinsics/src/cpu.sch create mode 100644 juno_samples/implicit_clone/src/cpu.sch create mode 100644 juno_samples/matmul/src/cpu.sch create mode 100644 juno_samples/nested_ccp/src/cpu.sch create mode 100644 juno_samples/simple3/src/cpu.sch diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs index ce52a20e..a266deea 100644 --- a/hercules_cg/src/gpu.rs +++ b/hercules_cg/src/gpu.rs @@ -1280,14 +1280,17 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; let mut succs = self.control_subgraph.succs(id); let succ1 = succs.next().unwrap(); let succ2 = succs.next().unwrap(); + let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some(); + let succ1_block_name = self.get_block_name(succ1, false); + let succ2_block_name = self.get_block_name(succ2, false); write!( w_term, "\tif ({}) {{\n", self.get_value(*cond, false, false) )?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ1_block_name.clone() } else { succ2_block_name.clone() })?; write!(w_term, "\t}} else {{\n")?; - write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?; + write!(w_term, "\t\tgoto {};\n", if succ1_is_true { succ2_block_name } else { succ1_block_name })?; write!(w_term, "\t}}\n")?; 1 } @@ -1590,7 +1593,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?; size } else { format!( - "({} + {} - 1) / {}) * {} + {}", + "({} + {} - 1) / {} * {} + {}", acc, align, align, align, size ) } diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs index 3b35f73e..4237cc84 100644 --- a/hercules_cg/src/rt.rs +++ b/hercules_cg/src/rt.rs @@ -387,7 +387,7 @@ impl<'a> RTContext<'a> { { write!(block, "backing_{}.byte_add(", device.name())?; self.codegen_dynamic_constant(offset, block)?; - write!(block, ")")? + write!(block, "), ")? } for dc in dynamic_constants { self.codegen_dynamic_constant(*dc, block)?; diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs index 4e651fa8..8862c11a 100644 --- a/hercules_samples/dot/src/main.rs +++ b/hercules_samples/dot/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("dot"); fn main() { async_std::task::block_on(async { - let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; - let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; #[cfg(not(feature = "cuda"))] { + let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a = HerculesCPURef::from_slice(&a); + let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b = HerculesCPURef::from_slice(&b); let mut r = runner!(dot); let c = r.run(8, a, b).await; @@ -21,8 +21,10 @@ fn main() { } #[cfg(feature = "cuda")] { + let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0]; let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let a = a_box.get_ref(); + let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0]; let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let b = b_box.get_ref(); let mut r = runner!(dot); diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs index 92b30c43..8e261270 100644 --- a/juno_samples/antideps/build.rs +++ b/juno_samples/antideps/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("antideps.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("antideps.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch new file mode 100644 index 00000000..9c2c44a8 --- /dev/null +++ b/juno_samples/antideps/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch index d3f4a6c2..25dba2e7 100644 --- a/juno_samples/antideps/src/gpu.sch +++ b/juno_samples/antideps/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_a ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs index e43a2ac8..5d25fbba 100644 --- a/juno_samples/casts_and_intrinsics/build.rs +++ b/juno_samples/casts_and_intrinsics/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("casts_and_intrinsics.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("casts_and_intrinsics.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch new file mode 100644 index 00000000..9c2c44a8 --- /dev/null +++ b/juno_samples/casts_and_intrinsics/src/cpu.sch @@ -0,0 +1,20 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch index b2fb3449..f051ed8c 100644 --- a/juno_samples/casts_and_intrinsics/src/gpu.sch +++ b/juno_samples/casts_and_intrinsics/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.casts_and_intrinsics); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn index f3096ec3..ab4fbe59 100644 --- a/juno_samples/cava/src/cava.jn +++ b/juno_samples/cava/src/cava.jn @@ -116,7 +116,25 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r filter[i, j] = input[chan, r + i - 1, c + j - 1]; } } - res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter); + + let tmp : f32[9]; + for r = 0 to 3 { + for c = 0 to 3 { + tmp[r * 3 + c] = filter[r, c]; + } + } + + for i = 0 to 9 - 1 { + for j = 0 to 9 - i - 1 { + if tmp[j] > tmp[j+1] { + let t : f32 = tmp[j]; + tmp[j] = tmp[j+1]; + tmp[j+1] = t; + } + } + } + + res[chan, r, c] = tmp[9 / 2]; } else { res[chan, r, c] = input[chan, r, c]; } diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch index 07f71c99..bb91af72 100644 --- a/juno_samples/cava/src/gpu.sch +++ b/juno_samples/cava/src/gpu.sch @@ -2,13 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale); +inline(*); +let out = auto-outline(*); +gpu(out.cava); ip-sroa(*); sroa(*); - dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*) + diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs index e05808f9..482bbf8d 100644 --- a/juno_samples/cava/src/main.rs +++ b/juno_samples/cava/src/main.rs @@ -9,11 +9,15 @@ use self::cava_rust::CHAN; use self::image_proc::*; use hercules_rt::{runner, HerculesCPURef}; +#[cfg(feature = "cuda")] +use hercules_rt::CUDABox; use image::ImageError; use clap::Parser; +use std::mem; + juno_build::juno!("cava"); fn run_cava( @@ -27,39 +31,67 @@ fn run_cava( coefs: &[f32], tonemap: &[f32], ) -> Box<[u8]> { - assert_eq!(image.len(), CHAN * rows * cols); - let image = HerculesCPURef::from_slice(image); + assert_eq!(image.len(), CHAN * rows * cols); assert_eq!(tstw.len(), CHAN * CHAN); - let tstw = HerculesCPURef::from_slice(tstw); - assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN); - let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); - assert_eq!(weights.len(), num_ctrl_pts * CHAN); - let weights = HerculesCPURef::from_slice(weights); - assert_eq!(coefs.len(), 4 * CHAN); - let coefs = HerculesCPURef::from_slice(coefs); - assert_eq!(tonemap.len(), 256 * CHAN); - let tonemap = HerculesCPURef::from_slice(tonemap); - - let mut r = runner!(cava); - async_std::task::block_on(async { - r.run( - rows as u64, - cols as u64, - num_ctrl_pts as u64, - image, - tstw, - ctrl_pts, - weights, - coefs, - tonemap, - ) - .await - }).as_slice::<u8>().to_vec().into_boxed_slice() + + #[cfg(not(feature = "cuda"))] + { + let image = HerculesCPURef::from_slice(image); + let tstw = HerculesCPURef::from_slice(tstw); + let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts); + let weights = HerculesCPURef::from_slice(weights); + let coefs = HerculesCPURef::from_slice(coefs); + let tonemap = HerculesCPURef::from_slice(tonemap); + let mut r = runner!(cava); + async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image, + tstw, + ctrl_pts, + weights, + coefs, + tonemap, + ) + .await + }).as_slice::<u8>().to_vec().into_boxed_slice() + } + + #[cfg(feature = "cuda")] + { + let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image)); + let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw)); + let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts)); + let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights)); + let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs)); + let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap)); + let mut r = runner!(cava); + let res = async_std::task::block_on(async { + r.run( + rows as u64, + cols as u64, + num_ctrl_pts as u64, + image.get_ref(), + tstw.get_ref(), + ctrl_pts.get_ref(), + weights.get_ref(), + coefs.get_ref(), + tonemap.get_ref(), + ) + .await + }); + let num_out = unsafe { res.__size() / std::mem::size_of::<u8>() }; + let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice(); + res.to_cpu_ref(&mut res_cpu); + res_cpu + } } enum Error { diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn index b9806c93..d901e7e1 100644 --- a/juno_samples/concat/src/concat.jn +++ b/juno_samples/concat/src/concat.jn @@ -18,7 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t { } #[entry] -fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 { - let arr3 = concat::<i32, 3, 6>(arr1, arr2); - return sum::<i32, 9>(arr3); +fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 { + let arr3 = concat::<i32, a, b>(arr1, arr2); + return sum::<i32, a + b>(arr3); } diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch index 680adaeb..7b87070a 100644 --- a/juno_samples/concat/src/cpu.sch +++ b/juno_samples/concat/src/cpu.sch @@ -2,12 +2,12 @@ gvn(*); phi-elim(*); dce(*); -cpu(concat, sum); +inline(*); +auto-outline(*); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch index 8ee4ef0e..71bed4b4 100644 --- a/juno_samples/concat/src/gpu.sch +++ b/juno_samples/concat/src/gpu.sch @@ -2,12 +2,13 @@ gvn(*); phi-elim(*); dce(*); -gpu(concat, sum); +inline(*); +let out = auto-outline(*); +gpu(out.concat_entry); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -15,3 +16,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs index d0929fbf..78932421 100644 --- a/juno_samples/concat/src/main.rs +++ b/juno_samples/concat/src/main.rs @@ -10,14 +10,13 @@ juno_build::juno!("concat"); fn main() { async_std::task::block_on(async { let mut r = runner!(concat_entry); + let mut a_data = [7, 7, 0]; + let mut b_data = [7, 7, 0, 0, 7, 7]; #[cfg(not(feature = "cuda"))] { - let mut a_data = [7, 7, 0]; let a = HerculesCPURef::from_slice(&mut a_data); - let mut b_data = [7, 7, 0, 0, 7, 7]; let b = HerculesCPURef::from_slice(&mut b_data); - let output = r.run(a, b).await; - println!("{}", output); + let output = r.run(3, 6, a, b).await; assert_eq!(output, 42); } #[cfg(feature = "cuda")] @@ -26,8 +25,7 @@ fn main() { let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data)); let mut b_data = [7, 7, 0, 0, 7, 7]; let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data)); - let output = r.run(a.get_ref(), b.get_ref()).await; - println!("{}", output); + let output = r.run(3, 6, a.get_ref(), b.get_ref()).await; assert_eq!(output, 42); } }); diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs index dc134e59..a464568d 100644 --- a/juno_samples/implicit_clone/build.rs +++ b/juno_samples/implicit_clone/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("implicit_clone.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("implicit_clone.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch new file mode 100644 index 00000000..ebf9d8fe --- /dev/null +++ b/juno_samples/implicit_clone/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch index 443fc778..0f7c8021 100644 --- a/juno_samples/implicit_clone/src/gpu.sch +++ b/juno_samples/implicit_clone/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs index ff3e3d8c..c7f18a99 100644 --- a/juno_samples/matmul/build.rs +++ b/juno_samples/matmul/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("matmul.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("matmul.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch new file mode 100644 index 00000000..412e8cbb --- /dev/null +++ b/juno_samples/matmul/src/cpu.sch @@ -0,0 +1,21 @@ +gvn(*); +phi-elim(*); +dce(*); + +let out = auto-outline(*); +cpu(out.matmul, out.tiled_64_matmul); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch index e85dafdf..dd2dc14c 100644 --- a/juno_samples/matmul/src/gpu.sch +++ b/juno_samples/matmul/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.matmul, out.tiled_64_matmul); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,7 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); + diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs index 2352ddef..ec111bc1 100644 --- a/juno_samples/nested_ccp/build.rs +++ b/juno_samples/nested_ccp/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("nested_ccp.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("nested_ccp.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch new file mode 100644 index 00000000..ebf9d8fe --- /dev/null +++ b/juno_samples/nested_ccp/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch index 021a05e3..69e18343 100644 --- a/juno_samples/nested_ccp/src/gpu.sch +++ b/juno_samples/nested_ccp/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.ccp_example, out.median_array, out.no_underflow); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +float-collections(*); +dce(*); +gcm(*); diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs index 412d56a4..99ef150d 100644 --- a/juno_samples/nested_ccp/src/main.rs +++ b/juno_samples/nested_ccp/src/main.rs @@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp"); fn main() { async_std::task::block_on(async { - let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); + let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]); let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]); #[cfg(not(feature = "cuda"))] { - let a = HerculesCPURef::from_slice(&a); + let a = HerculesCPURefMut::from_slice(&mut a); let b = HerculesCPURefMut::from_slice(&mut b); let mut r = runner!(ccp_example); let output_example = r.run(a).await; @@ -23,7 +23,7 @@ fn main() { } #[cfg(feature = "cuda")] { - let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a)); + let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a)); let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b)); let mut r = runner!(ccp_example); let output_example = r.run(a.get_ref_mut()).await; diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs index a0874af7..bfd37cb5 100644 --- a/juno_samples/simple3/build.rs +++ b/juno_samples/simple3/build.rs @@ -1,17 +1,10 @@ use juno_build::JunoCompiler; fn main() { - #[cfg(feature = "cuda")] JunoCompiler::new() .file_in_src("simple3.jn") .unwrap() - .schedule_in_src("gpu.sch") - .unwrap() - .build() - .unwrap(); - #[cfg(not(feature = "cuda"))] - JunoCompiler::new() - .file_in_src("simple3.jn") + .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" }) .unwrap() .build() .unwrap(); diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch new file mode 100644 index 00000000..d933f69c --- /dev/null +++ b/juno_samples/simple3/src/cpu.sch @@ -0,0 +1,19 @@ +gvn(*); +phi-elim(*); +dce(*); + +auto-outline(*); + +ip-sroa(*); +sroa(*); +dce(*); +gvn(*); +phi-elim(*); +dce(*); + +infer-schedules(*); + +gcm(*); +dce(*); +float-collections(*); +gcm(*); diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch index e97627d4..d27e5831 100644 --- a/juno_samples/simple3/src/gpu.sch +++ b/juno_samples/simple3/src/gpu.sch @@ -8,7 +8,6 @@ gpu(out.simple3); ip-sroa(*); sroa(*); dce(*); -float-collections(*); gvn(*); phi-elim(*); dce(*); @@ -16,3 +15,6 @@ dce(*); infer-schedules(*); gcm(*); +dce(*); +float-collections(*); +gcm(*); -- GitLab