diff --git a/hercules_cg/src/gpu.rs b/hercules_cg/src/gpu.rs
index ce52a20e9a0d2e3961f4ab7865ac5b5763df9ae8..a266deea40e187afbf84bb52aa91e96f569001be 100644
--- a/hercules_cg/src/gpu.rs
+++ b/hercules_cg/src/gpu.rs
@@ -1280,14 +1280,17 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                 let mut succs = self.control_subgraph.succs(id);
                 let succ1 = succs.next().unwrap();
                 let succ2 = succs.next().unwrap();
+                let succ1_is_true = self.function.nodes[succ1.idx()].try_projection(1).is_some();
+                let (true_succ, false_succ) =
+                    if succ1_is_true { (succ1, succ2) } else { (succ2, succ1) };
                 write!(
                     w_term,
                     "\tif ({}) {{\n",
                     self.get_value(*cond, false, false)
                 )?;
-                write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ1, false))?;
+                write!(w_term, "\t\tgoto {};\n", self.get_block_name(true_succ, false))?;
                 write!(w_term, "\t}} else {{\n")?;
-                write!(w_term, "\t\tgoto {};\n", self.get_block_name(succ2, false))?;
+                write!(w_term, "\t\tgoto {};\n", self.get_block_name(false_succ, false))?;
                 write!(w_term, "\t}}\n")?;
                 1
             }
@@ -1590,7 +1593,7 @@ extern \"C\" {} {}(", ret_type.clone(), self.function.name)?;
                             size
                         } else {
                             format!(
-                                "({} + {} - 1) / {}) * {} + {}",
+                                "({} + {} - 1) / {} * {} + {}",
                                 acc, align, align, align, size
                             )
                         }
diff --git a/hercules_cg/src/rt.rs b/hercules_cg/src/rt.rs
index 3b35f73ed370984753fa8e416f3b26c24951283f..4237cc8496ab5cf33cb4702f6ed85539860c0f95 100644
--- a/hercules_cg/src/rt.rs
+++ b/hercules_cg/src/rt.rs
@@ -387,7 +387,7 @@ impl<'a> RTContext<'a> {
                 {
                     write!(block, "backing_{}.byte_add(", device.name())?;
                     self.codegen_dynamic_constant(offset, block)?;
-                    write!(block, ")")?
+                    write!(block, "), ")?
                 }
                 for dc in dynamic_constants {
                     self.codegen_dynamic_constant(*dc, block)?;
diff --git a/hercules_samples/dot/src/main.rs b/hercules_samples/dot/src/main.rs
index 4e651fa8986f07dfcc086b67ea70650746d05ea1..8862c11a9273f9808f2148f1067dcf3f5953c11f 100644
--- a/hercules_samples/dot/src/main.rs
+++ b/hercules_samples/dot/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("dot");
 
 fn main() {
     async_std::task::block_on(async {
-        let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
-        let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
         #[cfg(not(feature = "cuda"))]
         {
+            let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
             let a = HerculesCPURef::from_slice(&a);
+            let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
             let b = HerculesCPURef::from_slice(&b);
             let mut r = runner!(dot);
             let c = r.run(8, a, b).await;
@@ -21,8 +21,10 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
+            let mut a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
             let a_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let a = a_box.get_ref();
+            let mut b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
             let b_box = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let b = b_box.get_ref();
             let mut r = runner!(dot);
diff --git a/juno_samples/antideps/build.rs b/juno_samples/antideps/build.rs
index 92b30c43b385812c02f2685ab3cdc76950dd6960..8e26127040d08c4819f4307dbf6ebae296710257 100644
--- a/juno_samples/antideps/build.rs
+++ b/juno_samples/antideps/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("antideps.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("antideps.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/antideps/src/cpu.sch b/juno_samples/antideps/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..9c2c44a8da96406d0cc42a028dcf7ce38fefecdf
--- /dev/null
+++ b/juno_samples/antideps/src/cpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/antideps/src/gpu.sch b/juno_samples/antideps/src/gpu.sch
index d3f4a6c28ccefa44b2dd474e7f54546f1a21eab6..25dba2e7ae565d0600c53ae5fff0d403a5e4f3bc 100644
--- a/juno_samples/antideps/src/gpu.sch
+++ b/juno_samples/antideps/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple_antideps, out.loop_antideps, out.complex_antideps1, out.complex_a
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/casts_and_intrinsics/build.rs b/juno_samples/casts_and_intrinsics/build.rs
index e43a2ac82f2dc60dc1198a9884cce617aa3d70f5..5d25fbba54843b131f944928fdec120e8c30048e 100644
--- a/juno_samples/casts_and_intrinsics/build.rs
+++ b/juno_samples/casts_and_intrinsics/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("casts_and_intrinsics.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("casts_and_intrinsics.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/casts_and_intrinsics/src/cpu.sch b/juno_samples/casts_and_intrinsics/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..9c2c44a8da96406d0cc42a028dcf7ce38fefecdf
--- /dev/null
+++ b/juno_samples/casts_and_intrinsics/src/cpu.sch
@@ -0,0 +1,20 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/casts_and_intrinsics/src/gpu.sch b/juno_samples/casts_and_intrinsics/src/gpu.sch
index b2fb3449954ca567ef3b24adebcf5610763a687e..f051ed8c4aacceb8f86404c3e7cc6e9d140b68de 100644
--- a/juno_samples/casts_and_intrinsics/src/gpu.sch
+++ b/juno_samples/casts_and_intrinsics/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.casts_and_intrinsics);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/cava/src/cava.jn b/juno_samples/cava/src/cava.jn
index f3096ec34230ad838ca803e8d8de79c176aa9bd2..ab4fbe594b227483d2835c638bdbfba5f450e7cd 100644
--- a/juno_samples/cava/src/cava.jn
+++ b/juno_samples/cava/src/cava.jn
@@ -116,7 +116,25 @@ fn denoise<row : usize, col : usize>(input : f32[CHAN, row, col]) -> f32[CHAN, r
               filter[i, j] = input[chan, r + i - 1, c + j - 1];
             }
           }
-          res[chan, r, c] = medianMatrix::<f32, 3, 3>(filter);
+
+          let tmp : f32[9];
+          for fr = 0 to 3 {
+            for fc = 0 to 3 {
+              tmp[fr * 3 + fc] = filter[fr, fc];
+            }
+          }
+
+          for i = 0 to 9 - 1 {
+            for j = 0 to 9 - i - 1 {
+              if tmp[j] > tmp[j+1] {
+                let t : f32 = tmp[j];
+                tmp[j] = tmp[j+1];
+                tmp[j+1] = t;
+              }
+            }
+          }
+
+          res[chan, r, c] = tmp[9 / 2];
         } else {
           res[chan, r, c] = input[chan, r, c];
         }
diff --git a/juno_samples/cava/src/gpu.sch b/juno_samples/cava/src/gpu.sch
index 07f71c99302c86a9ec3dc5c813b854fe3af983b0..bb91af7271bfc218001c77d64bb7d3f7a7888fbb 100644
--- a/juno_samples/cava/src/gpu.sch
+++ b/juno_samples/cava/src/gpu.sch
@@ -2,13 +2,13 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-gpu(scale, demosaic, medianMatrix, transform, gamut, tone_map, descale);
+inline(*);
+let out = auto-outline(*);
+gpu(out.cava);
 
 ip-sroa(*);
 sroa(*);
-
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +16,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/cava/src/main.rs b/juno_samples/cava/src/main.rs
index e05808f9e0c415dd3c3af0e480af905c04e0152c..482bbf8deb0af323255b004a7e33e70202acb886 100644
--- a/juno_samples/cava/src/main.rs
+++ b/juno_samples/cava/src/main.rs
@@ -9,11 +9,16 @@ use self::cava_rust::CHAN;
 use self::image_proc::*;
 
 use hercules_rt::{runner, HerculesCPURef};
+#[cfg(feature = "cuda")]
+use hercules_rt::CUDABox;
 
 use image::ImageError;
 
 use clap::Parser;
 
+#[cfg(feature = "cuda")]
+use std::mem;
+
 juno_build::juno!("cava");
 
 fn run_cava(
@@ -27,39 +32,71 @@ fn run_cava(
     coefs: &[f32],
     tonemap: &[f32],
 ) -> Box<[u8]> {
-    assert_eq!(image.len(), CHAN * rows * cols);
-    let image = HerculesCPURef::from_slice(image);
 
+    assert_eq!(image.len(), CHAN * rows * cols);
     assert_eq!(tstw.len(), CHAN * CHAN);
-    let tstw = HerculesCPURef::from_slice(tstw);
-
     assert_eq!(ctrl_pts.len(), num_ctrl_pts * CHAN);
-    let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
-
     assert_eq!(weights.len(), num_ctrl_pts * CHAN);
-    let weights = HerculesCPURef::from_slice(weights);
-
     assert_eq!(coefs.len(), 4 * CHAN);
-    let coefs = HerculesCPURef::from_slice(coefs);
-
     assert_eq!(tonemap.len(), 256 * CHAN);
-    let tonemap = HerculesCPURef::from_slice(tonemap);
-
-    let mut r = runner!(cava);
-    async_std::task::block_on(async {
-        r.run(
-            rows as u64,
-            cols as u64,
-            num_ctrl_pts as u64,
-            image,
-            tstw,
-            ctrl_pts,
-            weights,
-            coefs,
-            tonemap,
-        )
-        .await
-    }).as_slice::<u8>().to_vec().into_boxed_slice()
+
+    // Without the `cuda` feature: pass the input slices to the runner as CPU references.
+    #[cfg(not(feature = "cuda"))]
+    {
+        let image = HerculesCPURef::from_slice(image);
+        let tstw = HerculesCPURef::from_slice(tstw);
+        let ctrl_pts = HerculesCPURef::from_slice(ctrl_pts);
+        let weights = HerculesCPURef::from_slice(weights);
+        let coefs = HerculesCPURef::from_slice(coefs);
+        let tonemap = HerculesCPURef::from_slice(tonemap);
+        let mut r = runner!(cava);
+        async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image,
+                tstw,
+                ctrl_pts,
+                weights,
+                coefs,
+                tonemap,
+            )
+            .await
+        }).as_slice::<u8>().to_vec().into_boxed_slice()
+    }
+
+    // With the `cuda` feature: wrap each input in a `CUDABox`, run, then copy the result out.
+    #[cfg(feature = "cuda")]
+    {
+        let image = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(image));
+        let tstw = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tstw));
+        let ctrl_pts = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(ctrl_pts));
+        let weights = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(weights));
+        let coefs = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(coefs));
+        let tonemap = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(tonemap));
+        let mut r = runner!(cava);
+        let res = async_std::task::block_on(async {
+            r.run(
+                rows as u64,
+                cols as u64,
+                num_ctrl_pts as u64,
+                image.get_ref(),
+                tstw.get_ref(),
+                ctrl_pts.get_ref(),
+                weights.get_ref(),
+                coefs.get_ref(),
+                tonemap.get_ref(),
+            )
+            .await
+        });
+        // SAFETY (NOTE(review): confirm against hercules_rt): `res` is the reference the
+        // runner just returned and is still live here, so reading its size should be valid.
+        let num_out = unsafe { res.__size() / mem::size_of::<u8>() };
+        let mut res_cpu: Box<[u8]> = vec![0; num_out].into_boxed_slice();
+        res.to_cpu_ref(&mut res_cpu);
+        res_cpu
+    }
 }
 
 enum Error {
diff --git a/juno_samples/concat/src/concat.jn b/juno_samples/concat/src/concat.jn
index b9806c9348e73a3467f29f2f39c981b8f347037c..d901e7e17c527edcbcb01929b14adfcb37e6c642 100644
--- a/juno_samples/concat/src/concat.jn
+++ b/juno_samples/concat/src/concat.jn
@@ -18,7 +18,7 @@ fn sum<t : number, c : usize>(arr : t[c]) -> t {
 }
 
 #[entry]
-fn concat_entry(arr1 : i32[3], arr2 : i32[6]) -> i32 {
-  let arr3 = concat::<i32, 3, 6>(arr1, arr2);
-  return sum::<i32, 9>(arr3);
+fn concat_entry<a : usize, b: usize>(arr1 : i32[a], arr2 : i32[b]) -> i32 {
+  let arr3 = concat::<i32, a, b>(arr1, arr2);
+  return sum::<i32, a + b>(arr3);
 }
diff --git a/juno_samples/concat/src/cpu.sch b/juno_samples/concat/src/cpu.sch
index 680adaeb2070b26b68fba361aafb64e5af2afc03..7b87070aa8624cd7aeb01ffa610e7654eb06a42e 100644
--- a/juno_samples/concat/src/cpu.sch
+++ b/juno_samples/concat/src/cpu.sch
@@ -2,12 +2,12 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-cpu(concat, sum);
+inline(*);
+auto-outline(*);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -15,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/concat/src/gpu.sch b/juno_samples/concat/src/gpu.sch
index 8ee4ef0e5627aa6a51cb5e86dccd014f18dcfd96..71bed4b4a8913d4edbeadbc58b2ce4ed2492ebc6 100644
--- a/juno_samples/concat/src/gpu.sch
+++ b/juno_samples/concat/src/gpu.sch
@@ -2,12 +2,13 @@ gvn(*);
 phi-elim(*);
 dce(*);
 
-gpu(concat, sum);
+inline(*);
+let out = auto-outline(*);
+gpu(out.concat_entry);
 
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -15,3 +16,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/concat/src/main.rs b/juno_samples/concat/src/main.rs
index d0929fbf0a3d5d5073b51f79ee8ab575208eb7cc..78932421df9890fbebfa4e136b4f85f875166130 100644
--- a/juno_samples/concat/src/main.rs
+++ b/juno_samples/concat/src/main.rs
@@ -10,14 +10,13 @@ juno_build::juno!("concat");
 fn main() {
     async_std::task::block_on(async {
         let mut r = runner!(concat_entry);
+        let mut a_data = [7, 7, 0];
+        let mut b_data = [7, 7, 0, 0, 7, 7];
         #[cfg(not(feature = "cuda"))]
         {
-            let mut a_data = [7, 7, 0];
             let a = HerculesCPURef::from_slice(&mut a_data);
-            let mut b_data = [7, 7, 0, 0, 7, 7];
             let b = HerculesCPURef::from_slice(&mut b_data);
-            let output = r.run(a, b).await;
-            println!("{}", output);
+            let output = r.run(3, 6, a, b).await;
             assert_eq!(output, 42);
         }
         #[cfg(feature = "cuda")]
@@ -26,8 +25,7 @@ fn main() {
             let a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a_data));
             let mut b_data = [7, 7, 0, 0, 7, 7];
             let b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b_data));
-            let output = r.run(a.get_ref(), b.get_ref()).await;
-            println!("{}", output);
+            let output = r.run(3, 6, a.get_ref(), b.get_ref()).await;
             assert_eq!(output, 42);
         }
     });
diff --git a/juno_samples/implicit_clone/build.rs b/juno_samples/implicit_clone/build.rs
index dc134e59757a301b2be13ab61ed4d2feb252d1fe..a464568d08ffbdb7426b66b044173f981bf42d8e 100644
--- a/juno_samples/implicit_clone/build.rs
+++ b/juno_samples/implicit_clone/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("implicit_clone.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("implicit_clone.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/implicit_clone/src/cpu.sch b/juno_samples/implicit_clone/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..ebf9d8fee2514fe72e91979cf7ef3f9d90ac91e3
--- /dev/null
+++ b/juno_samples/implicit_clone/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/implicit_clone/src/gpu.sch b/juno_samples/implicit_clone/src/gpu.sch
index 443fc778fe6b6f4489d6c5a339b691f488f917c2..0f7c80213d20481ce51628736f0ed07be21da399 100644
--- a/juno_samples/implicit_clone/src/gpu.sch
+++ b/juno_samples/implicit_clone/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple_implicit_clone, out.loop_implicit_clone, out.double_loop_implicit
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/matmul/build.rs b/juno_samples/matmul/build.rs
index ff3e3d8c79091349f9d76716f3d7496bba3d5503..c7f18a99c47239eab72e207f7c1cb8195f45a9f6 100644
--- a/juno_samples/matmul/build.rs
+++ b/juno_samples/matmul/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("matmul.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("matmul.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/matmul/src/cpu.sch b/juno_samples/matmul/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..412e8cbb31c3affbcc1cdd6c5a3c085a93804d74
--- /dev/null
+++ b/juno_samples/matmul/src/cpu.sch
@@ -0,0 +1,21 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+let out = auto-outline(*);
+cpu(out.matmul, out.tiled_64_matmul);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/matmul/src/gpu.sch b/juno_samples/matmul/src/gpu.sch
index e85dafdfdbfac2021b0215f625fb8077972be6b4..dd2dc14c064b8e4e9b1ed639507972ec6d507a84 100644
--- a/juno_samples/matmul/src/gpu.sch
+++ b/juno_samples/matmul/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.matmul, out.tiled_64_matmul);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,7 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
+
diff --git a/juno_samples/nested_ccp/build.rs b/juno_samples/nested_ccp/build.rs
index 2352ddef0a432d42a09fef72042c516843799290..ec111bc1b9d57eda7416d89d7df06caeb60bc258 100644
--- a/juno_samples/nested_ccp/build.rs
+++ b/juno_samples/nested_ccp/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("nested_ccp.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("nested_ccp.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/nested_ccp/src/cpu.sch b/juno_samples/nested_ccp/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..ebf9d8fee2514fe72e91979cf7ef3f9d90ac91e3
--- /dev/null
+++ b/juno_samples/nested_ccp/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/gpu.sch b/juno_samples/nested_ccp/src/gpu.sch
index 021a05e3f361e4f5d1ae679c33f24282d431b2cc..69e18343665142516174a7a04ed8e8f5fd536d74 100644
--- a/juno_samples/nested_ccp/src/gpu.sch
+++ b/juno_samples/nested_ccp/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.ccp_example, out.median_array, out.no_underflow);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+float-collections(*);
+dce(*);
+gcm(*);
diff --git a/juno_samples/nested_ccp/src/main.rs b/juno_samples/nested_ccp/src/main.rs
index 412d56a4f4e66af5a6608822219bce16ac5554dc..99ef150d942d256c4475344822efcfb0cb6f693a 100644
--- a/juno_samples/nested_ccp/src/main.rs
+++ b/juno_samples/nested_ccp/src/main.rs
@@ -8,11 +8,11 @@ juno_build::juno!("nested_ccp");
 
 fn main() {
     async_std::task::block_on(async {
-        let a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
+        let mut a: Box<[f32]> = Box::new([17.0, 18.0, 19.0]);
         let mut b: Box<[i32]> = Box::new([12, 16, 4, 18, 23, 56, 93, 22, 14]);
         #[cfg(not(feature = "cuda"))]
         {
-            let a = HerculesCPURef::from_slice(&a);
+            let a = HerculesCPURefMut::from_slice(&mut a);
             let b = HerculesCPURefMut::from_slice(&mut b);
             let mut r = runner!(ccp_example);
             let output_example = r.run(a).await;
@@ -23,7 +23,7 @@ fn main() {
         }
         #[cfg(feature = "cuda")]
         {
-            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&a));
+            let mut a = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut a));
             let mut b = CUDABox::from_cpu_ref(HerculesCPURef::from_slice(&mut b));
             let mut r = runner!(ccp_example);
             let output_example = r.run(a.get_ref_mut()).await;
diff --git a/juno_samples/simple3/build.rs b/juno_samples/simple3/build.rs
index a0874af7ecb42b29eea3aaaa5628d1c0ddeb2090..bfd37cb50d74bf09118b7b768600d132d7fbf9e1 100644
--- a/juno_samples/simple3/build.rs
+++ b/juno_samples/simple3/build.rs
@@ -1,17 +1,10 @@
 use juno_build::JunoCompiler;
 
 fn main() {
-    #[cfg(feature = "cuda")]
     JunoCompiler::new()
         .file_in_src("simple3.jn")
         .unwrap()
-        .schedule_in_src("gpu.sch")
-        .unwrap()
-        .build()
-        .unwrap();
-    #[cfg(not(feature = "cuda"))]
-    JunoCompiler::new()
-        .file_in_src("simple3.jn")
+        .schedule_in_src(if cfg!(feature = "cuda") { "gpu.sch" } else { "cpu.sch" })
         .unwrap()
         .build()
         .unwrap();
diff --git a/juno_samples/simple3/src/cpu.sch b/juno_samples/simple3/src/cpu.sch
new file mode 100644
index 0000000000000000000000000000000000000000..d933f69c025454f094f67b03711da4a7d23e4d18
--- /dev/null
+++ b/juno_samples/simple3/src/cpu.sch
@@ -0,0 +1,19 @@
+gvn(*);
+phi-elim(*);
+dce(*);
+
+auto-outline(*);
+
+ip-sroa(*);
+sroa(*);
+dce(*);
+gvn(*);
+phi-elim(*);
+dce(*);
+
+infer-schedules(*);
+
+gcm(*);
+dce(*);
+float-collections(*);
+gcm(*);
diff --git a/juno_samples/simple3/src/gpu.sch b/juno_samples/simple3/src/gpu.sch
index e97627d43c0e1dccf67f565c686712acb1f080f8..d27e58311603dc6186a3b7a474117fd76766ca09 100644
--- a/juno_samples/simple3/src/gpu.sch
+++ b/juno_samples/simple3/src/gpu.sch
@@ -8,7 +8,6 @@ gpu(out.simple3);
 ip-sroa(*);
 sroa(*);
 dce(*);
-float-collections(*);
 gvn(*);
 phi-elim(*);
 dce(*);
@@ -16,3 +15,6 @@ dce(*);
 infer-schedules(*);
 
 gcm(*);
+dce(*);
+float-collections(*);
+gcm(*);