From 948fe3b976233754ff4eecb6aa7b7b1fcdaf33b5 Mon Sep 17 00:00:00 2001
From: Russel Arbore <rarbore2@illinois.edu>
Date: Thu, 30 Jan 2025 19:46:06 -0600
Subject: [PATCH] Fix codegen bugs before they get exposed by forkify

---
 hercules_cg/src/gpu.rs                        |  9 +-
 hercules_cg/src/rt.rs                         |  2 +-
 hercules_samples/dot/src/main.rs              |  6 +-
 juno_samples/antideps/build.rs                |  9 +-
 juno_samples/antideps/src/cpu.sch             | 20 +++++
 juno_samples/antideps/src/gpu.sch             |  5 +-
 juno_samples/casts_and_intrinsics/build.rs    |  9 +-
 juno_samples/casts_and_intrinsics/src/cpu.sch | 20 +++++
 juno_samples/casts_and_intrinsics/src/gpu.sch |  5 +-
 juno_samples/cava/src/cava.jn                 | 20 ++++-
 juno_samples/cava/src/gpu.sch                 | 10 ++-
 juno_samples/cava/src/main.rs                 | 86 +++++++++++++------
 juno_samples/concat/src/concat.jn             |  6 +-
 juno_samples/concat/src/cpu.sch               |  8 +-
 juno_samples/concat/src/gpu.sch               |  9 +-
 juno_samples/concat/src/main.rs               | 10 +--
 juno_samples/implicit_clone/build.rs          |  9 +-
 juno_samples/implicit_clone/src/cpu.sch       | 19 ++++
 juno_samples/implicit_clone/src/gpu.sch       |  4 +-
 juno_samples/matmul/build.rs                  |  9 +-
 juno_samples/matmul/src/cpu.sch               | 21 +++++
 juno_samples/matmul/src/gpu.sch               |  5 +-
 juno_samples/nested_ccp/build.rs              |  9 +-
 juno_samples/nested_ccp/src/cpu.sch           | 19 ++++
 juno_samples/nested_ccp/src/gpu.sch           |  4 +-
 juno_samples/nested_ccp/src/main.rs           |  6 +-
 juno_samples/simple3/build.rs                 |  9 +-
 juno_samples/simple3/src/cpu.sch              | 19 ++++
 juno_samples/simple3/src/gpu.sch              |  4 +-
 29 files changed, 264 insertions(+), 107 deletions(-)
 create mode 100644 juno_samples/antideps/src/cpu.sch
 create mode 100644 juno_samples/casts_and_intrinsics/src/cpu.sch
 create mode 100644 juno_samples/implicit_clone/src/cpu.sch
 create mode 100644 juno_samples/matmul/src/cpu.sch
 create mode 100644 juno_samples/nested_ccp/src/cpu.sch
 create mode 100644 juno_samples/simple3/src/cpu.sch

diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index ce52a20e..a266deea 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1280,14 +1280,17 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 let mut succs = self.control_subgraph.succs(id);
                 let succ1 = succs.next().unwrap();
                 let succ2 = succs.next().unwrap();
+                // Route the `if` arm to whichever successor `try_projection(1)` matches.
+                let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some();
+                let (on_true, on_false) = if succ1_is_true { (succ1, succ2) } else { (succ2, succ1) };
                 write!(
                     w_term,
                     "\tif ({}) {{\n",
                     self.get_value(*cond, false, false)
                 )?;
-                write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?;
+                write!(w_term, "\t\tgoto {};\n", self.get_block_name(on_true, false))?;
                 write!(w_term, "\t}} else {{\n")?;
-                write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?;
+                write!(w_term, "\t\tgoto {};\n", self.get_block_name(on_false, false))?;
                 write!(w_term, "\t}}\n")?;
                 1
             }
@@ -1590,7 +1593,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                             size
                         } else {
                             format!(
-                                "({} + {} - 1) / {}) * {} + {}",
+                                "({} + {} - 1) / {} * {} + {}",
                                 acc, align, align, align, size
                             )
                         }
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 3b35f73e..4237cc84 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -387,7 +387,7 @@ impl<'a> RTContext<'a> {
                 {
                     write!(block, "backing_{}.byte_add(", device.name())?;
                     self.codegen_dynamic_constant(offset, block)?;
-                    write!(block, ")")?
+                    write!(block, "), ")?
                 }
                 for dc in dynamic_constants {
                     self.codegen_dynamic_constant(*dc, block)?;
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 4e651fa8..8862c11a 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
-        let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
         #[cfg(not(feature = "cuda"))]
         {
+            let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
             let a = HerculesCPURef::from_slice(&a);
+            let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
             let b = HerculesCPURef::from_slice(&b);
             let mut r = runner!(dot);
             let c = r.run(8, a, b).await;
@@ -21,8 +21,10 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
+            let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
             let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let a = a_box.get_ref();
+            let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
             let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let b = b_box.get_ref();
             let mut r = runner!(dot);
diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs
index 92b30c43..8e261270 100644
--- a/juno_samples/antideps/build.rs
+++ b/juno_samples/antideps/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("antideps.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("antideps.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch
new file mode 100644
index 00000000..9c2c44a8
--- /dev/null
+++ b/juno_samples/antideps/src/cpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch
index d3f4a6c2..25dba2e7 100644
--- a/juno_samples/antideps/src/gpu.sch
+++ b/juno_samples/antideps/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_a
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index e43a2ac8..5d25fbba 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("casts_and_intrinsics.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("casts_and_intrinsics.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch
new file mode 100644
index 00000000..9c2c44a8
--- /dev/null
+++ b/juno_samples/casts_and_intrinsics/src/cpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch
index b2fb3449..f051ed8c 100644
--- a/juno_samples/casts_and_intrinsics/src/gpu.sch
+++ b/juno_samples/casts_and_intrinsics/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.casts_and_intrinsics);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index f3096ec3..ab4fbe59 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -116,7 +116,25 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r
               filter[i, j] = input[chan, r + i - 1, c + j - 1];
             }
           }
-          res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter);
+          // Flatten the 3x3 window; i/j are used so the pixel indices r/c are not shadowed.
+          let tmp : f32[9];
+          for i = 0 to 3 {
+            for j = 0 to 3 {
+              tmp[i * 3 + j] = filter[i, j];
+            }
+          }
+          // Bubble sort the nine samples into ascending order.
+          for i = 0 to 9 - 1 {
+            for j = 0 to 9 - i - 1 {
+              if tmp[j] > tmp[j+1] {
+                let t : f32 = tmp[j];
+                tmp[j] = tmp[j+1];
+                tmp[j+1] = t;
+              }
+            }
+          }
+          // The middle element of the sorted samples is the median.
+          res[chan, r, c] = tmp[9 / 2];
         } else {
           res[chan, r, c] = input[chan, r, c];
         }
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index 07f71c99..bb91af72 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -2,13 +2,13 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale);
+inline(*);
+let out = auto-outline(*);
+gpu(out.cava);
 
 ip-sroa(*);
 sroa(*);
-
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +16,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index e05808f9..482bbf8d 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -9,11 +9,15 @@ use self::cava_rust::CHAN;
 use self::image_proc::*;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 use image::ImageError;
 
 use clap::Parser;
 
+use std::mem;
+
 juno_build::juno!("cava");
 
 fn run_cava(
@@ -27,39 +31,67 @@ fn run_cava(
     coefs: &[f32],
     tonemap: &[f32],
 ) -> Box<[u8]> {
-    assert_eq!(image.len(), CHAN * rows * cols);
-    let image = HerculesCPURef::from_slice(image);
 
+    assert_eq!(image.len(), CHAN * rows * cols);
     assert_eq!(tstw.len(), CHAN * CHAN);
-    let tstw = HerculesCPURef::from_slice(tstw);
-
     assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
-    let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-
     assert_eq!(weights.len(), num_ctrl_pts * CHAN);
-    let weights = HerculesCPURef::from_slice(weights);
-
     assert_eq!(coefs.len(), 4 * CHAN);
-    let coefs = HerculesCPURef::from_slice(coefs);
-
     assert_eq!(tonemap.len(), 256 * CHAN);
-    let tonemap = HerculesCPURef::from_slice(tonemap);
-
-    let mut r = runner!(cava);
-    async_std::task::block_on(async {
-        r.run(
-            rows as u64,
-            cols as u64,
-            num_ctrl_pts as u64,
-            image,
-            tstw,
-            ctrl_pts,
-            weights,
-            coefs,
-            tonemap,
-        )
-        .await
-    }).as_slice::<u8>().to_vec().into_boxed_slice()
+
+    // CPU build: wrap the borrowed inputs as CPU references and copy the returned bytes out.
+    #[cfg(not(feature = "cuda"))]
+    {
+        let mut r = runner!(cava);
+        async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                HerculesCPURef::from_slice(image),
+                HerculesCPURef::from_slice(tstw),
+                HerculesCPURef::from_slice(ctrl_pts),
+                HerculesCPURef::from_slice(weights),
+                HerculesCPURef::from_slice(coefs),
+                HerculesCPURef::from_slice(tonemap),
+            )
+            .await
+        })
+        .as_slice::<u8>()
+        .to_vec()
+        .into_boxed_slice()
+    }
+
+    // CUDA build: inputs go through `CUDABox`; the result is copied back into a CPU buffer.
+    #[cfg(feature = "cuda")]
+    {
+        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
+        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
+        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
+        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
+        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
+        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
+        let mut r = runner!(cava);
+        let res = async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image.get_ref(),
+                tstw.get_ref(),
+                ctrl_pts.get_ref(),
+                weights.get_ref(),
+                coefs.get_ref(),
+                tonemap.get_ref(),
+            )
+            .await
+        });
+        // NOTE(review): `__size` presumably reports the result size in bytes — confirm.
+        let num_out = unsafe { res.__size() } / mem::size_of::<u8>();
+        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
+        res.to_cpu_ref(&mut res_cpu);
+        res_cpu
+    }
 }
 
 enum Error {
diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn
index b9806c93..d901e7e1 100644
--- a/juno_samples/concat/src/concat.jn
+++ b/juno_samples/concat/src/concat.jn
@@ -18,7 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t {
 }
 
 #[entry]
-fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 {
-  let arr3 = concat::<i32, 3, 6>(arr1, arr2);
-  return sum::<i32, 9>(arr3);
+fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 {
+  let arr3 = concat::<i32, a, b>(arr1, arr2);
+  return sum::<i32, a + b>(arr3);
 }
diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch
index 680adaeb..7b87070a 100644
--- a/juno_samples/concat/src/cpu.sch
+++ b/juno_samples/concat/src/cpu.sch
@@ -2,12 +2,12 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-cpu(concat, sum);
+inline(*);
+auto-outline(*);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -15,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch
index 8ee4ef0e..71bed4b4 100644
--- a/juno_samples/concat/src/gpu.sch
+++ b/juno_samples/concat/src/gpu.sch
@@ -2,12 +2,13 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-gpu(concat, sum);
+inline(*);
+let out = auto-outline(*);
+gpu(out.concat_entry);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -15,3 +16,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index d0929fbf..78932421 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -10,14 +10,13 @@ juno_build::juno!("concat");
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
+        let mut a_data = [7, 7, 0];
+        let mut b_data = [7, 7, 0, 0, 7, 7];
         #[cfg(not(feature = "cuda"))]
         {
-            let mut a_data = [7, 7, 0];
             let a = HerculesCPURef::from_slice(&mut a_data);
-            let mut b_data = [7, 7, 0, 0, 7, 7];
             let b = HerculesCPURef::from_slice(&mut b_data);
-            let output = r.run(a, b).await;
-            println!("{}", output);
+            let output = r.run(3, 6, a, b).await;
             assert_eq!(output, 42);
         }
         #[cfg(feature = "cuda")]
@@ -26,8 +25,7 @@ fn main() {
             let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data));
             let mut b_data = [7, 7, 0, 0, 7, 7];
             let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data));
-            let output = r.run(a.get_ref(), b.get_ref()).await;
-            println!("{}", output);
+            let output = r.run(3, 6, a.get_ref(), b.get_ref()).await;
             assert_eq!(output, 42);
         }
     });
diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs
index dc134e59..a464568d 100644
--- a/juno_samples/implicit_clone/build.rs
+++ b/juno_samples/implicit_clone/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("implicit_clone.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("implicit_clone.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch
new file mode 100644
index 00000000..ebf9d8fe
--- /dev/null
+++ b/juno_samples/implicit_clone/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch
index 443fc778..0f7c8021 100644
--- a/juno_samples/implicit_clone/src/gpu.sch
+++ b/juno_samples/implicit_clone/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index ff3e3d8c..c7f18a99 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("matmul.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("matmul.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
new file mode 100644
index 00000000..412e8cbb
--- /dev/null
+++ b/juno_samples/matmul/src/cpu.sch
@@ -0,0 +1,21 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+cpu(out.matmul, out.tiled_64_matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index e85dafdf..dd2dc14c 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.matmul, out.tiled_64_matmul);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index 2352ddef..ec111bc1 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("nested_ccp.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("nested_ccp.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch
new file mode 100644
index 00000000..ebf9d8fe
--- /dev/null
+++ b/juno_samples/nested_ccp/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
index 021a05e3..69e18343 100644
--- a/juno_samples/nested_ccp/src/gpu.sch
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.ccp_example, out.median_array, out.no_underflow);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 412d56a4..99ef150d 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
         #[cfg(not(feature = "cuda"))]
         {
-            let a = HerculesCPURef::from_slice(&a);
+            let a = HerculesCPURefMut::from_slice(&mut a);
             let b = HerculesCPURefMut::from_slice(&mut b);
             let mut r = runner!(ccp_example);
             let output_example = r.run(a).await;
@@ -23,7 +23,7 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
-            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let mut r = runner!(ccp_example);
             let output_example = r.run(a.get_ref_mut()).await;
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index a0874af7..bfd37cb5 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("simple3.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("simple3.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch
new file mode 100644
index 00000000..d933f69c
--- /dev/null
+++ b/juno_samples/simple3/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+dce(*);
+float-collections(*);
+gcm(*);
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
index e97627d4..d27e5831 100644
--- a/juno_samples/simple3/src/gpu.sch
+++ b/juno_samples/simple3/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple3);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+float-collections(*);
+gcm(*);
-- 
GitLab