
Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: llvm/hercules
Commits on Source (8)
Showing 1075 additions and 677 deletions
use hercules_ir::*;
/*
* Top level function to definitively place functions onto devices. A function
* may store a device placement, but only optionally - this function assigns
* devices to the rest of the functions.
*/
pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec<Device> {
let mut devices = vec![];
for (idx, function) in functions.into_iter().enumerate() {
if let Some(device) = function.device {
devices.push(device);
} else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 {
devices.push(Device::AsyncRust);
} else {
devices.push(Device::LLVM);
}
}
devices
}
#![feature(if_let_guard, let_chains)]
pub mod cpu;
pub mod device;
pub mod rt;
pub use crate::cpu::*;
pub use crate::device::*;
pub use crate::rt::*;
use std::collections::BTreeMap;
use hercules_ir::*;
pub const LARGEST_ALIGNMENT: usize = 8;
/*
* The alignment of a type does not depend on dynamic constants.
*/
......@@ -28,3 +30,20 @@ pub fn get_type_alignment(types: &Vec<Type>, ty: TypeID) -> usize {
Type::Array(elem, _) => get_type_alignment(types, elem),
}
}
/*
* Nodes producing collection values are "colored" with the device their
* underlying memory lives on.
*/
pub type FunctionNodeColors = BTreeMap<NodeID, Device>;
pub type NodeColors = Vec<FunctionNodeColors>;
/*
* The allocation information for each function is the size of the backing
* memory needed and the offsets into that backing memory for each constant
* object and call node in the function.
*/
pub type FunctionBackingAllocation =
BTreeMap<Device, (DynamicConstantID, BTreeMap<NodeID, DynamicConstantID>)>;
pub type BackingAllocations = BTreeMap<FunctionID, FunctionBackingAllocation>;
pub const BACKED_DEVICES: [Device; 2] = [Device::LLVM, Device::CUDA];
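// Illustrative sketch, not part of this diff: constructing a
// FunctionBackingAllocation by hand to show the shape of the maps above. The
// concrete ID values and the `DynamicConstantID::new` constructor are
// assumptions (the other ID newtypes expose `::new(usize)` elsewhere in this
// diff). Each backed device maps to a total backing size (as a dynamic
// constant) plus per-node offsets into that backing memory.
fn _backing_allocation_sketch() -> FunctionBackingAllocation {
    let total = DynamicConstantID::new(0);
    let mut offsets = BTreeMap::new();
    // Hypothetical: constant node 3 and call node 7 get offsets 1 and 2 (as
    // dynamic constant IDs) into the LLVM backing memory.
    offsets.insert(NodeID::new(3), DynamicConstantID::new(1));
    offsets.insert(NodeID::new(7), DynamicConstantID::new(2));
    let mut fba = FunctionBackingAllocation::new();
    fba.insert(Device::LLVM, (total, offsets));
    fba
}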
This diff is collapsed.
use std::collections::BTreeSet;
use std::mem::take;
use crate::*;
/*
* Top level function to definitively place functions onto devices. A function
* may store a device placement, but only optionally - this function assigns
* devices to the rest of the functions.
*/
pub fn device_placement(functions: &Vec<Function>, callgraph: &CallGraph) -> Vec<Device> {
let mut devices = vec![];
for (idx, function) in functions.into_iter().enumerate() {
if let Some(device) = function.device {
devices.push(device);
} else if function.entry || callgraph.num_callees(FunctionID::new(idx)) != 0 {
devices.push(Device::AsyncRust);
} else {
devices.push(Device::LLVM);
}
}
devices
}
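// Illustrative sketch, not part of this diff: the placement rule above,
// restated on plain values. A function with an explicit device keeps it;
// otherwise entry functions and functions that call other functions run on
// the async Rust runtime, and leaf functions go to a device backend.
fn _placement_rule(explicit: Option<Device>, entry: bool, num_callees: usize) -> Device {
    explicit.unwrap_or(if entry || num_callees != 0 {
        Device::AsyncRust
    } else {
        Device::LLVM
    })
}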
pub type FunctionObjectDeviceDemands = Vec<BTreeSet<Device>>;
pub type ObjectDeviceDemands = Vec<FunctionObjectDeviceDemands>;
/*
* This analysis figures out which device each collection object may be on. At
* first, an object may need to be on different devices at different times. This
* is fine during optimization.
*/
pub fn object_device_demands(
functions: &Vec<Function>,
types: &Vec<Type>,
typing: &ModuleTyping,
callgraph: &CallGraph,
objects: &CollectionObjects,
devices: &Vec<Device>,
) -> ObjectDeviceDemands {
// An object is "demanded" on a device when:
// 1. The object is used by a primitive read node or write node in a device
// function. This includes objects on the `data` input to write nodes.
// Non-primitive reads don't demand an object on a device, since they are
// lowered to pointer math and incur no actual memory transfers.
// 2. The object is passed as input to a call node where the corresponding
// object in the callee is demanded on a device.
// 3. The object is returned from a call node where the corresponding object
// in the callee is demanded on a device.
// Note that reads and writes in an RT function don't induce a device demand.
// This is because RT functions can call device functions as necessary to
// arbitrarily move data onto / off of devices (though this may be slow).
// Traverse the functions in a module in reverse topological order, since
// the analysis of a function depends on all functions it calls.
let mut demands: ObjectDeviceDemands = vec![vec![]; functions.len()];
let topo = callgraph.topo();
for func_id in topo {
let function = &functions[func_id.idx()];
let typing = &typing[func_id.idx()];
let device = devices[func_id.idx()];
demands[func_id.idx()].resize(objects[&func_id].num_objects(), BTreeSet::new());
match device {
Device::LLVM | Device::CUDA => {
for (idx, node) in function.nodes.iter().enumerate() {
// Condition #1.
match node {
Node::Read {
collect,
indices: _,
} if types[typing[idx].idx()].is_primitive() => {
for object in objects[&func_id].objects(*collect) {
demands[func_id.idx()][object.idx()].insert(device);
}
}
Node::Write {
collect,
data,
indices: _,
} => {
for object in objects[&func_id]
.objects(*collect)
.into_iter()
.chain(objects[&func_id].objects(*data).into_iter())
{
demands[func_id.idx()][object.idx()].insert(device);
}
}
_ => {}
}
}
}
Device::AsyncRust => {
for (idx, node) in function.nodes.iter().enumerate() {
if let Node::Call {
control: _,
function: callee,
dynamic_constants: _,
args,
} = node
{
// Condition #2.
for (param_idx, arg) in args.into_iter().enumerate() {
if let Some(callee_obj) = objects[callee].param_to_object(param_idx) {
let callee_demands =
take(&mut demands[callee.idx()][callee_obj.idx()]);
for object in objects[&func_id].objects(*arg) {
demands[func_id.idx()][object.idx()]
.extend(callee_demands.iter());
}
demands[callee.idx()][callee_obj.idx()] = callee_demands;
}
}
// Condition #3.
for callee_obj in objects[callee].returned_objects() {
let callee_demands = take(&mut demands[callee.idx()][callee_obj.idx()]);
for object in objects[&func_id].objects(NodeID::new(idx)) {
demands[func_id.idx()][object.idx()].extend(callee_demands.iter());
}
demands[callee.idx()][callee_obj.idx()] = callee_demands;
}
}
}
}
}
}
demands
}
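// Illustrative sketch, not part of this diff: the demand propagation above on
// a toy two-function module, using strings in place of the real IDs. A CUDA
// function `kernel` reads a primitive out of its parameter object (condition
// #1), so that object is demanded on CUDA; an RT function `main` passes one
// of its objects to `kernel` (condition #2), so the demand propagates to the
// caller's object. The names `kernel` and `main` are hypothetical.
fn _demand_propagation_sketch() {
    use std::collections::{BTreeMap, BTreeSet};
    let mut demands: BTreeMap<&str, BTreeSet<&str>> = BTreeMap::new();
    // Condition #1 in `kernel`, a CUDA function.
    demands.entry("kernel.param0").or_default().insert("cuda");
    // Condition #2 in `main`, an RT function: the argument object inherits
    // the callee parameter object's demands.
    let callee_demands = demands["kernel.param0"].clone();
    demands.entry("main.obj0").or_default().extend(callee_demands);
    assert!(demands["main.obj0"].contains("cuda"));
}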
use std::collections::HashMap;
use bitvec::prelude::*;
use crate::*;
/*
......@@ -304,3 +306,55 @@ pub fn postdominator(subgraph: Subgraph, fake_root: NodeID) -> DomTree {
// root as the root of the dominator analysis.
dominator(&reversed_subgraph, fake_root)
}
/*
* Check if a data node dominates a control node. This involves checking all
* immediate control uses to see if they dominate the queried control node.
*/
pub fn does_data_dom_control(
function: &Function,
data: NodeID,
control: NodeID,
dom: &DomTree,
) -> bool {
let mut stack = vec![data];
let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
visited.set(data.idx(), true);
while let Some(pop) = stack.pop() {
let node = &function.nodes[pop.idx()];
let imm_control = match node {
Node::Phi { control, data: _ }
| Node::Reduce {
control,
init: _,
reduct: _,
}
| Node::Call {
control,
function: _,
dynamic_constants: _,
args: _,
} => Some(*control),
_ if node.is_control() => Some(pop),
_ => {
for u in get_uses(node).as_ref() {
if !visited[u.idx()] {
visited.set(u.idx(), true);
stack.push(*u);
}
}
None
}
};
if let Some(imm_control) = imm_control
&& !dom.does_dom(imm_control, control)
{
return false;
}
}
true
}
......@@ -18,6 +18,7 @@ pub fn xdot_module(
reverse_postorders: &Vec<Vec<NodeID>>,
doms: Option<&Vec<DomTree>>,
fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>,
devices: Option<&Vec<Device>>,
bbs: Option<&Vec<BasicBlocks>>,
) {
let mut tmp_path = temp_dir();
......@@ -31,6 +32,7 @@ pub fn xdot_module(
&reverse_postorders,
doms,
fork_join_maps,
devices,
bbs,
&mut contents,
)
......@@ -53,6 +55,7 @@ pub fn write_dot<W: Write>(
reverse_postorders: &Vec<Vec<NodeID>>,
doms: Option<&Vec<DomTree>>,
fork_join_maps: Option<&Vec<HashMap<NodeID, NodeID>>>,
devices: Option<&Vec<Device>>,
bbs: Option<&Vec<BasicBlocks>>,
w: &mut W,
) -> std::fmt::Result {
......@@ -65,7 +68,12 @@ pub fn write_dot<W: Write>(
for (idx, id) in reverse_postorder.iter().enumerate() {
reverse_postorder_node_numbers[id.idx()] = idx;
}
write_subgraph_header(function_id, module, w)?;
write_subgraph_header(
function_id,
module,
devices.map(|devices| devices[function_id.idx()]),
w,
)?;
// Step 1: draw IR graph itself. This includes all IR nodes and all edges
// between IR nodes.
......@@ -168,7 +176,7 @@ pub fn write_dot<W: Write>(
}
}
// Step 4: draw basic block edges in indigo.
// Step 4: draw basic block edges in blue.
if let Some(bbs) = bbs {
let bbs = &bbs[function_id.idx()].0;
for (idx, bb) in bbs.into_iter().enumerate() {
......@@ -179,7 +187,7 @@ pub fn write_dot<W: Write>(
*bb,
function_id,
true,
"indigo",
"lightslateblue",
"dotted",
&module,
w,
......@@ -204,6 +212,7 @@ fn write_digraph_header<W: Write>(w: &mut W) -> std::fmt::Result {
fn write_subgraph_header<W: Write>(
function_id: FunctionID,
module: &Module,
device: Option<Device>,
w: &mut W,
) -> std::fmt::Result {
let function = &module.functions[function_id.idx()];
......@@ -219,8 +228,8 @@ fn write_subgraph_header<W: Write>(
} else {
write!(w, "label=\"{}\"\n", function.name)?;
}
let color = match function.device {
Some(Device::LLVM) => "paleturquoise1",
let color = match device.or(function.device) {
Some(Device::LLVM) => "slategray1",
Some(Device::CUDA) => "darkseagreen1",
Some(Device::AsyncRust) => "peachpuff1",
None => "ivory2",
......
use std::collections::{HashMap, HashSet};
use bitvec::prelude::*;
use crate::*;
/*
......@@ -75,55 +73,3 @@ pub fn compute_fork_join_nesting(
})
.collect()
}
/*
* Check if a data node dominates a control node. This involves checking all
* immediate control uses to see if they dominate the queried control node.
*/
pub fn does_data_dom_control(
function: &Function,
data: NodeID,
control: NodeID,
dom: &DomTree,
) -> bool {
let mut stack = vec![data];
let mut visited = bitvec![u8, Lsb0; 0; function.nodes.len()];
visited.set(data.idx(), true);
while let Some(pop) = stack.pop() {
let node = &function.nodes[pop.idx()];
let imm_control = match node {
Node::Phi { control, data: _ }
| Node::Reduce {
control,
init: _,
reduct: _,
}
| Node::Call {
control,
function: _,
dynamic_constants: _,
args: _,
} => Some(*control),
_ if node.is_control() => Some(pop),
_ => {
for u in get_uses(node).as_ref() {
if !visited[u.idx()] {
visited.set(u.idx(), true);
stack.push(*u);
}
}
None
}
};
if let Some(imm_control) = imm_control
&& !dom.does_dom(imm_control, control)
{
return false;
}
}
true
}
......@@ -332,7 +332,7 @@ pub enum Schedule {
* The authoritative enumeration of supported backends. Multiple backends may
* correspond to the same kind of hardware.
*/
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Device {
LLVM,
CUDA,
......@@ -1710,6 +1710,16 @@ impl Intrinsic {
}
}
impl Device {
pub fn name(&self) -> &'static str {
match self {
Device::LLVM => "cpu",
Device::CUDA => "cuda",
Device::AsyncRust => "rt",
}
}
}
/*
* Rust things to make newtyped IDs usable.
*/
......
......@@ -11,6 +11,7 @@ pub mod callgraph;
pub mod collections;
pub mod dataflow;
pub mod def_use;
pub mod device;
pub mod dom;
pub mod dot;
pub mod fork_join_analysis;
......@@ -26,6 +27,7 @@ pub use crate::callgraph::*;
pub use crate::collections::*;
pub use crate::dataflow::*;
pub use crate::def_use::*;
pub use crate::device::*;
pub use crate::dom::*;
pub use crate::dot::*;
pub use crate::fork_join_analysis::*;
......
......@@ -233,14 +233,21 @@ fn loop_reachability_helper(
pub fn reduce_cycles(
function: &Function,
def_use: &ImmutableDefUseMap,
fork_join_map: &HashMap<NodeID, NodeID>,
fork_join_nest: &HashMap<NodeID, Vec<NodeID>>,
) -> HashMap<NodeID, HashSet<NodeID>> {
let reduces = (0..function.nodes.len())
.filter(|idx| function.nodes[*idx].is_reduce())
.map(NodeID::new);
let mut result = HashMap::new();
let join_fork_map: HashMap<NodeID, NodeID> = fork_join_map
.into_iter()
.map(|(fork, join)| (*join, *fork))
.collect();
for reduce in reduces {
let (_, _, reduct) = function.nodes[reduce.idx()].try_reduce().unwrap();
let (join, _, reduct) = function.nodes[reduce.idx()].try_reduce().unwrap();
let fork = join_fork_map[&join];
// First, find all data nodes that are used by the `reduct` input of the
// reduce, including the `reduct` itself.
......@@ -249,7 +256,13 @@ pub fn reduce_cycles(
let mut worklist = vec![reduct];
while let Some(item) = worklist.pop() {
for u in get_uses(&function.nodes[item.idx()]).as_ref() {
if !function.nodes[u.idx()].is_control() && !use_reachable.contains(u) {
if !function.nodes[u.idx()].is_control()
&& !use_reachable.contains(u)
&& function.nodes[u.idx()]
.try_phi()
.map(|(control, _)| fork_join_nest[&fork].contains(&control))
.unwrap_or(true)
{
use_reachable.insert(*u);
worklist.push(*u);
}
......
use hercules_ir::ir::*;
use crate::*;
......@@ -335,6 +335,14 @@ impl<'a: 'b, 'b> FunctionEditor<'a> {
self.function_id
}
pub fn get_types(&self) -> Ref<'_, Vec<Type>> {
self.types.borrow()
}
pub fn get_constants(&self) -> Ref<'_, Vec<Constant>> {
self.constants.borrow()
}
pub fn get_dynamic_constants(&self) -> Ref<'_, Vec<DynamicConstant>> {
self.dynamic_constants.borrow()
}
......
use std::cell::Ref;
use std::collections::{BTreeMap, BTreeSet, HashMap, VecDeque};
use std::iter::{empty, once, zip, FromIterator};
......@@ -5,6 +6,7 @@ use bitvec::prelude::*;
use either::Either;
use union_find::{QuickFindUf, UnionBySize, UnionFind};
use hercules_cg::*;
use hercules_ir::*;
use crate::*;
......@@ -35,8 +37,35 @@ use crate::*;
* liveness analysis result, so every spill restarts the process of checking for
* spills. Once no more spills are found, the process terminates. When a spill
* is found, the basic block assignments, and all the other analyses, are not
* necessarily valid anymore, so this function is called in a loop in pass.rs
* until no more spills are found.
* necessarily valid anymore, so this function is called in a loop in the pass
* manager until no more spills are found.
*
* GCM is additionally complicated by the need to generate code that references
* objects across multiple devices. In particular, GCM makes sure that every
* object lives on exactly one device, so that references to that object always
* live on a single device. Additionally, GCM makes sure that the objects that a
* node may produce are all on the same device, so that a pointer produced by,
* for example, a select node can only refer to memory on a single device. Extra
* collection constants and potentially inter-device copies are inserted as
* necessary to make sure this is true - an inter-device copy is represented by
* a write where the `collect` and `data` inputs are on different devices. This
* is only valid in RT functions - it is asserted that this isn't necessary in
* device functions. This process "colors" the nodes in the function.
*
* GCM has one final responsibility - object allocation. Each Hercules function
* receives a pointer to a "backing" memory where collection constants live. The
* backing memory a function receives is for the constants in that function and
* the constants of every called function. Concretely, a function passes a
* sub-region of its backing memory to a callee, which serves as that callee's
* backing memory for the duration of the call. Object allocation consists of
* finding the required sizes of all collection constants and functions in
* terms of dynamic constants (dynamic constant math is expressive enough to
* represent sizes of types, which is very convenient) and determining the
* concrete offsets into the backing memory where constants and callee
* sub-regions live. When two users of backing memory are never live at once,
* they may share backing memory. This is done after nodes are given a single
* device color, since we need to know which values live on which device
* before we can allocate them to backing memory - there is a separate backing
* memory per device.
*/
pub fn gcm(
editor: &mut FunctionEditor,
......@@ -48,7 +77,10 @@ pub fn gcm(
fork_join_map: &HashMap<NodeID, NodeID>,
loops: &LoopTree,
objects: &CollectionObjects,
) -> Option<BasicBlocks> {
devices: &Vec<Device>,
object_device_demands: &FunctionObjectDeviceDemands,
backing_allocations: &BackingAllocations,
) -> Option<(BasicBlocks, FunctionNodeColors, FunctionBackingAllocation)> {
let bbs = basic_blocks(
editor.func(),
editor.func_id(),
......@@ -59,11 +91,69 @@ pub fn gcm(
fork_join_map,
objects,
);
if spill_clones(editor, typing, control_subgraph, objects, &bbs) {
None
} else {
Some(bbs)
let liveness = liveness_dataflow(
editor.func(),
editor.func_id(),
control_subgraph,
objects,
&bbs,
);
if spill_clones(editor, typing, control_subgraph, objects, &bbs, &liveness) {
return None;
}
let func_id = editor.func_id();
let Some(node_colors) = color_nodes(
editor,
reverse_postorder,
&objects[&func_id],
&object_device_demands,
) else {
return None;
};
let device = devices[func_id.idx()];
match device {
Device::LLVM | Device::CUDA => {
// Check that every object that has a demand in this function is only
// demanded on this device.
for demands in object_device_demands {
assert!(demands.is_empty() || (demands.len() == 1 && demands.contains(&device)))
}
}
Device::AsyncRust => {
// Check that every object that has a demand in this function only
// has a demand from one device.
for demands in object_device_demands {
assert!(demands.len() <= 1);
}
}
}
let mut alignments = vec![];
Ref::map(editor.get_types(), |types| {
for idx in 0..types.len() {
if types[idx].is_control() {
alignments.push(0);
} else {
alignments.push(get_type_alignment(types, TypeID::new(idx)));
}
}
&()
});
let backing_allocation = object_allocation(
editor,
typing,
&node_colors,
&alignments,
&liveness,
backing_allocations,
);
Some((bbs, node_colors, backing_allocation))
}
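// Illustrative sketch, not part of this diff: the backing-memory layout
// described in the comment above `gcm`, with concrete byte counts instead of
// dynamic-constant math. One collection constant (12 bytes, 4-byte aligned)
// is followed by a callee's sub-region (64 bytes here), which is
// conservatively aligned to LARGEST_ALIGNMENT since the callee's alignment
// requirement isn't known. The sizes are made up for the example.
fn _backing_layout_sketch() {
    let round_up = |x: usize, a: usize| ((x + a - 1) / a) * a;
    let mut total = 0usize;
    // Constant node: round up to its alignment, then reserve its size.
    total = round_up(total, 4);
    let constant_offset = total;
    total += 12;
    // Call node: round up to LARGEST_ALIGNMENT, then reserve the callee's
    // backing size as a sub-region at this offset.
    total = round_up(total, LARGEST_ALIGNMENT);
    let callee_offset = total;
    total += 64;
    assert_eq!((constant_offset, callee_offset, total), (0, 16, 80));
}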
/*
......@@ -109,8 +199,6 @@ fn basic_blocks(
args: _,
} => bbs[idx] = Some(control),
Node::Parameter { index: _ } => bbs[idx] = Some(NodeID::new(0)),
Node::Constant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
Node::DynamicConstant { id: _ } => bbs[idx] = Some(NodeID::new(0)),
_ if function.nodes[idx].is_control() => bbs[idx] = Some(NodeID::new(idx)),
_ => {}
}
......@@ -580,8 +668,6 @@ fn mutating_writes<'a>(
}
}
type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
/*
* Top level function to find implicit clones that need to be spilled. Returns
* whether a clone was spilled, in which case the whole scheduling process must
......@@ -593,19 +679,9 @@ fn spill_clones(
control_subgraph: &Subgraph,
objects: &CollectionObjects,
bbs: &BasicBlocks,
liveness: &Liveness,
) -> bool {
// Step 1: compute a liveness analysis of collection values in the IR. This
// requires a dataflow analysis over the scheduled IR, which is not a common
// need in Hercules, so just hardcode the analysis.
let liveness = liveness_dataflow(
editor.func(),
editor.func_id(),
control_subgraph,
objects,
bbs,
);
// Step 2: compute an interference graph from the liveness result. This
// Step 1: compute an interference graph from the liveness result. This
// graph contains a vertex per node ID producing a collection value and an
// edge per pair of node IDs that interfere. Nodes A and B interfere if node
// A is defined right above a point where node B is live and A != B. Extra
......@@ -652,7 +728,7 @@ fn spill_clones(
}
}
// Step 3: filter edges (A, B) to just see edges where A uses B and A
// Step 2: filter edges (A, B) to just see edges where A uses B and A
// mutates B. These are the edges that may require a spill.
let mut spill_edges = edges.into_iter().filter(|(a, b)| {
mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
......@@ -664,7 +740,7 @@ fn spill_clones(
|| editor.func().nodes[a.idx()].is_reduce()))
});
// Step 4: if there is a spill edge, spill it and return true. Otherwise,
// Step 3: if there is a spill edge, spill it and return true. Otherwise,
// return false.
if let Some((user, obj)) = spill_edges.next() {
// Figure out the most immediate dominating region for every basic
......@@ -818,6 +894,8 @@ fn spill_clones(
}
}
type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
/*
* Liveness dataflow analysis on scheduled Hercules IR. Just look at nodes that
* involve collections.
......@@ -938,3 +1016,179 @@ fn liveness_dataflow(
}
}
}
/*
* Determine what device each node produces a collection onto. Insert inter-
* device clones when a single node may potentially be on different devices.
*/
fn color_nodes(
editor: &mut FunctionEditor,
reverse_postorder: &Vec<NodeID>,
objects: &FunctionCollectionObjects,
object_device_demands: &FunctionObjectDeviceDemands,
) -> Option<FunctionNodeColors> {
// First, try to give each node a single color.
let mut colors = BTreeMap::new();
let mut bad_node = None;
'nodes: for id in reverse_postorder {
let mut device = None;
for object in objects.objects(*id) {
for demand in object_device_demands[object.idx()].iter() {
if let Some(device) = device
&& device != *demand
{
bad_node = Some(id);
break 'nodes;
}
device = Some(*demand);
}
}
if let Some(device) = device {
colors.insert(*id, device);
} else {
assert!(objects.objects(*id).is_empty(), "PANIC: Found an object with no device demands. This is technically possible and is easily supported by just picking an arbitrary device for this object. This assert exists because I'm curious to see where this will be needed first, and if that use is frivolous or not.");
}
}
if bad_node.is_some() {
todo!("Deal with inter-device demands.")
}
Some(colors)
}
fn align(edit: &mut FunctionEdit, mut acc: DynamicConstantID, align: usize) -> DynamicConstantID {
assert_ne!(align, 0);
if align != 1 {
let align_dc = edit.add_dynamic_constant(DynamicConstant::Constant(align));
let align_m1_dc = edit.add_dynamic_constant(DynamicConstant::Constant(align - 1));
acc = edit.add_dynamic_constant(DynamicConstant::Add(acc, align_m1_dc));
acc = edit.add_dynamic_constant(DynamicConstant::Div(acc, align_dc));
acc = edit.add_dynamic_constant(DynamicConstant::Mul(acc, align_dc));
}
acc
}
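// Illustrative sketch, not part of this diff: `align` above builds the usual
// round-up-to-alignment formula out of dynamic constant Add/Div/Mul nodes; on
// plain usizes it is ((acc + align - 1) / align) * align. For example,
// rounding 13 up to an alignment of 8 gives 16, and 16 stays 16.
fn _align_sketch(acc: usize, align: usize) -> usize {
    assert_ne!(align, 0);
    ((acc + align - 1) / align) * align
}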
/*
* Determine the size of a type in terms of dynamic constants.
*/
fn type_size(edit: &mut FunctionEdit, ty_id: TypeID, alignments: &Vec<usize>) -> DynamicConstantID {
let ty = edit.get_type(ty_id).clone();
let size = match ty {
Type::Control => panic!(),
Type::Boolean | Type::Integer8 | Type::UnsignedInteger8 => {
edit.add_dynamic_constant(DynamicConstant::Constant(1))
}
Type::Integer16 | Type::UnsignedInteger16 => {
edit.add_dynamic_constant(DynamicConstant::Constant(2))
}
Type::Integer32 | Type::UnsignedInteger32 | Type::Float32 => {
edit.add_dynamic_constant(DynamicConstant::Constant(4))
}
Type::Integer64 | Type::UnsignedInteger64 | Type::Float64 => {
edit.add_dynamic_constant(DynamicConstant::Constant(8))
}
Type::Product(fields) => {
// The layout of product types is like the C-style layout.
let mut acc_size = edit.add_dynamic_constant(DynamicConstant::Constant(0));
for field in fields {
// Round up to the alignment of the field, then add the size of
// the field.
let field_size = type_size(edit, field, alignments);
acc_size = align(edit, acc_size, alignments[field.idx()]);
acc_size = edit.add_dynamic_constant(DynamicConstant::Add(acc_size, field_size));
}
// Finally, round up to the alignment of the whole product, since
// the size needs to be a multiple of the alignment.
acc_size = align(edit, acc_size, alignments[ty_id.idx()]);
acc_size
}
Type::Summation(variants) => {
// A summation holds every variant in the same memory.
let mut acc_size = edit.add_dynamic_constant(DynamicConstant::Constant(0));
for variant in variants {
// Pick the size of the largest variant, since that's the most
// memory we would need.
let variant_size = type_size(edit, variant, alignments);
acc_size = edit.add_dynamic_constant(DynamicConstant::Max(acc_size, variant_size));
}
// Add one byte for the discriminant and align the whole summation.
let one = edit.add_dynamic_constant(DynamicConstant::Constant(1));
acc_size = edit.add_dynamic_constant(DynamicConstant::Add(acc_size, one));
acc_size = align(edit, acc_size, alignments[ty_id.idx()]);
acc_size
}
Type::Array(elem, bounds) => {
// The layout of an array is row-major linear in memory.
let mut acc_size = type_size(edit, elem, alignments);
for bound in bounds {
acc_size = edit.add_dynamic_constant(DynamicConstant::Mul(acc_size, bound));
}
acc_size
}
};
size
}
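// Illustrative sketch, not part of this diff: the C-style layout `type_size`
// computes, with concrete numbers. A product of (u8, u32, u16) with field
// alignments (1, 4, 2) and product alignment 4 lays out as u8 at 0, u32 at 4,
// u16 at 8, for 10 bytes, padded to 12 for the product's alignment. A
// summation of (u32 | u8) takes the largest variant plus one discriminant
// byte, padded to its alignment: max(4, 1) + 1 = 5, rounded up to 8.
fn _layout_sketch() {
    let round_up = |x: usize, a: usize| ((x + a - 1) / a) * a;
    // Product (u8, u32, u16): round up to each field's alignment, then add
    // the field's size; finally pad to the product's own alignment.
    let mut size = 0usize;
    size = round_up(size, 1) + 1;
    size = round_up(size, 4) + 4;
    size = round_up(size, 2) + 2;
    size = round_up(size, 4);
    assert_eq!(size, 12);
    // Summation (u32 | u8): largest variant plus a one-byte discriminant.
    let sum_size = round_up(4usize.max(1) + 1, 4);
    assert_eq!(sum_size, 8);
}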
/*
* Allocate objects in a function. Relies on the allocations of all called
* functions.
*/
fn object_allocation(
editor: &mut FunctionEditor,
typing: &Vec<TypeID>,
node_colors: &FunctionNodeColors,
alignments: &Vec<usize>,
liveness: &Liveness,
backing_allocations: &BackingAllocations,
) -> FunctionBackingAllocation {
let mut fba = BTreeMap::new();
let node_ids = editor.node_ids();
editor.edit(|mut edit| {
// For now, just allocate each object to its own slot.
let zero = edit.add_dynamic_constant(DynamicConstant::Constant(0));
for id in node_ids {
match *edit.get_node(id) {
Node::Constant { id: _ } => {
if !edit.get_type(typing[id.idx()]).is_primitive() {
let device = node_colors[&id];
let (total, offsets) =
fba.entry(device).or_insert_with(|| (zero, BTreeMap::new()));
*total = align(&mut edit, *total, alignments[typing[id.idx()].idx()]);
offsets.insert(id, *total);
let type_size = type_size(&mut edit, typing[id.idx()], alignments);
*total = edit.add_dynamic_constant(DynamicConstant::Add(*total, type_size));
}
}
Node::Call {
control: _,
function: callee,
dynamic_constants: _,
args: _,
} => {
for device in BACKED_DEVICES {
if let Some(callee_backing_size) = backing_allocations[&callee]
.get(&device)
.map(|(callee_total, _)| *callee_total)
{
let (total, offsets) =
fba.entry(device).or_insert_with(|| (zero, BTreeMap::new()));
// We don't know the alignment requirement of the memory
// in the callee, so just assume the largest alignment.
*total = align(&mut edit, *total, LARGEST_ALIGNMENT);
offsets.insert(id, *total);
*total = edit.add_dynamic_constant(DynamicConstant::Add(
*total,
callee_backing_size,
));
}
}
}
_ => {}
}
}
Ok(edit)
});
fba
}
......@@ -4,6 +4,7 @@ pub mod ccp;
pub mod crc;
pub mod dce;
pub mod delete_uncalled;
pub mod device_placement;
pub mod editor;
pub mod float_collections;
pub mod fork_concat_split;
......@@ -27,6 +28,7 @@ pub use crate::ccp::*;
pub use crate::crc::*;
pub use crate::dce::*;
pub use crate::delete_uncalled::*;
pub use crate::device_placement::*;
pub use crate::editor::*;
pub use crate::float_collections::*;
pub use crate::fork_concat_split::*;
......
use std::alloc::{alloc, alloc_zeroed, dealloc, Layout};
use std::alloc::{alloc, dealloc, Layout};
use std::marker::PhantomData;
use std::ptr::{copy_nonoverlapping, NonNull};
use std::slice::from_raw_parts;
use std::sync::atomic::{AtomicUsize, Ordering};
#[cfg(feature = "cuda")]
extern "C" {
fn cuda_alloc(size: usize) -> *mut u8;
fn cuda_alloc_zeroed(size: usize) -> *mut u8;
fn cuda_dealloc(ptr: *mut u8);
fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
}
use std::slice::{from_raw_parts, from_raw_parts_mut};
/*
* Each object needs to get assigned a unique ID.
* Define supporting types, functions, and macros for Hercules RT functions. For
* a more in-depth discussion of the design of these utilities, see hercules_cg/
* src/rt.rs (the RT backend).
*/
static NUM_OBJECTS: AtomicUsize = AtomicUsize::new(1);
/*
* An in-memory collection object that can be used by functions compiled by the
* Hercules compiler. Memory objects can be in these states:
*
* 1. Shared CPU - the object has a shared reference to some CPU memory, usually
* from the programmer using the Hercules RT API.
* 2. Exclusive CPU - the object has an exclusive reference to some CPU memory,
* usually from the programmer using the Hercules RT API.
* 3. Owned CPU - the object owns some allocated CPU memory.
* 4. Owned GPU - the object owns some allocated GPU memory.
*
* A single object can be in some combination of these states at the same time.
* Only some combinations are valid, because only some combinations are
* reachable. Under this assumption, we can model an object's placement as a
* state machine, where states are combinations of the aforementioned states,
* and actions are requests on the CPU or GPU, immutably or mutably. Here's the
* state transition table:
*
* Shared CPU = CS
* Exclusive CPU = CE
* Owned CPU = CO
* Owned GPU = GO
*
*         CPU     Mut CPU   GPU      Mut GPU
*       *------------------------------------
* CS    | CS      CO        CS,GO    GO
* CE    | CE      CE        CE,GO    GO
* CO    | CO      CO        CO,GO    GO
* GO    | CO      CO        GO       GO
* CS,GO | CS,GO   CO        CS,GO    GO
* CE,GO | CE,GO   CE        CE,GO    GO
* CO,GO | CO,GO   CO        CO,GO    GO
*       |
*
* A HerculesBox cannot be cloned, because it may hold a mutable reference to
* some CPU memory.
*/
#[derive(Debug)]
pub struct HerculesBox<'a> {
cpu_shared: Option<NonOwned<'a>>,
cpu_exclusive: Option<NonOwned<'a>>,
cpu_owned: Option<Owned>,
pub unsafe fn __cpu_alloc(size: usize) -> *mut u8 {
alloc(Layout::from_size_align(size, 16).unwrap())
}
#[cfg(feature = "cuda")]
cuda_owned: Option<Owned>,
pub unsafe fn __cpu_dealloc(ptr: *mut u8, size: usize) {
dealloc(ptr, Layout::from_size_align(size, 16).unwrap())
}
size: usize,
id: usize,
pub unsafe fn __copy_cpu_to_cpu(dst: *mut u8, src: *mut u8, size: usize) {
copy_nonoverlapping(src, dst, size);
}
#[cfg(feature = "cuda")]
extern "C" {
pub fn __cuda_alloc(size: usize) -> *mut u8;
pub fn __cuda_dealloc(ptr: *mut u8, size: usize);
pub fn __copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
pub fn __copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
pub fn __copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
}
#[derive(Clone, Debug)]
struct NonOwned<'a> {
pub struct HerculesCPURef<'a> {
ptr: NonNull<u8>,
offset: usize,
size: usize,
_phantom: PhantomData<&'a u8>,
}
#[derive(Debug)]
pub struct HerculesCPURefMut<'a> {
ptr: NonNull<u8>,
size: usize,
_phantom: PhantomData<&'a u8>,
}
#[cfg(feature = "cuda")]
#[derive(Clone, Debug)]
struct Owned {
pub struct HerculesCUDARef<'a> {
ptr: NonNull<u8>,
alloc_size: usize,
offset: usize,
size: usize,
_phantom: PhantomData<&'a u8>,
}
impl<'b, 'a: 'b> HerculesBox<'a> {
pub fn from_slice<T>(slice: &'a [T]) -> Self {
let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
let size = slice.len() * size_of::<T>();
let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
HerculesBox {
cpu_shared: Some(NonOwned {
ptr,
offset: 0,
_phantom: PhantomData,
}),
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
#[cfg(feature = "cuda")]
#[derive(Debug)]
pub struct HerculesCUDARefMut<'a> {
ptr: NonNull<u8>,
size: usize,
_phantom: PhantomData<&'a u8>,
}
size,
id,
}
}
#[cfg(feature = "cuda")]
#[derive(Debug)]
pub struct CUDABox {
ptr: NonNull<u8>,
size: usize,
}
pub fn from_slice_mut<T>(slice: &'a mut [T]) -> Self {
impl<'a> HerculesCPURef<'a> {
pub fn from_slice<T>(slice: &'a [T]) -> Self {
let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
let size = slice.len() * size_of::<T>();
let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
HerculesBox {
cpu_shared: None,
cpu_exclusive: Some(NonOwned {
ptr,
offset: 0,
_phantom: PhantomData,
}),
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
Self {
ptr,
size,
id,
_phantom: PhantomData,
}
}
pub fn as_slice<T>(&'b mut self) -> &'b [T] {
pub fn as_slice<T>(self) -> &'a [T] {
let ptr = self.ptr.as_ptr() as *const T;
assert_eq!(self.size % size_of::<T>(), 0);
unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) }
assert!(ptr.is_aligned());
unsafe { from_raw_parts(ptr, self.size / size_of::<T>()) }
}
unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
self.cpu_owned
.as_ref()
.map(|obj| obj.ptr.byte_add(obj.offset))
.or(self
.cpu_exclusive
.as_ref()
.map(|obj| obj.ptr.byte_add(obj.offset)))
.or(self
.cpu_shared
.as_ref()
.map(|obj| obj.ptr.byte_add(obj.offset)))
pub unsafe fn __ptr(&self) -> *mut u8 {
self.ptr.as_ptr() as *mut u8
}
#[cfg(feature = "cuda")]
unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
self.cuda_owned
.as_ref()
.map(|obj| obj.ptr.byte_add(obj.offset))
pub unsafe fn __size(&self) -> usize {
self.size
}
unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
if let Some(obj) = self.cpu_owned.as_ref() {
obj.ptr
} else {
let ptr =
NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
self.cpu_owned = Some(Owned {
ptr,
alloc_size: self.size,
offset: 0,
});
ptr
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
Self {
ptr: NonNull::new(ptr).unwrap(),
size,
_phantom: PhantomData,
}
}
}
#[cfg(feature = "cuda")]
unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
if let Some(obj) = self.cuda_owned.as_ref() {
obj.ptr
} else {
let ptr = NonNull::new(cuda_alloc(self.size)).unwrap();
self.cuda_owned = Some(Owned {
ptr,
alloc_size: self.size,
offset: 0,
});
ptr
impl<'a> HerculesCPURefMut<'a> {
pub fn from_slice<T>(slice: &'a mut [T]) -> Self {
let ptr = unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) };
let size = slice.len() * size_of::<T>();
Self {
ptr,
size,
_phantom: PhantomData,
}
}
unsafe fn deallocate_cpu(&mut self) {
if let Some(obj) = self.cpu_owned.take() {
dealloc(
obj.ptr.as_ptr(),
Layout::from_size_align_unchecked(obj.alloc_size, 16),
);
}
pub fn as_slice<T>(self) -> &'a mut [T] {
let ptr = self.ptr.as_ptr() as *mut T;
assert_eq!(self.size % size_of::<T>(), 0);
assert!(ptr.is_aligned());
unsafe { from_raw_parts_mut(ptr, self.size / size_of::<T>()) }
}
#[cfg(feature = "cuda")]
unsafe fn deallocate_cuda(&mut self) {
if let Some(obj) = self.cuda_owned.take() {
cuda_dealloc(obj.ptr.as_ptr());
pub fn as_ref(self) -> HerculesCPURef<'a> {
HerculesCPURef {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
}
pub unsafe fn __zeros(size: u64) -> Self {
let size = size as usize;
let id = NUM_OBJECTS.fetch_add(1, Ordering::Relaxed);
HerculesBox {
cpu_shared: None,
cpu_exclusive: None,
cpu_owned: Some(Owned {
ptr: NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16)))
.unwrap(),
alloc_size: size,
offset: 0,
}),
#[cfg(feature = "cuda")]
cuda_owned: None,
pub unsafe fn __ptr(&self) -> *mut u8 {
self.ptr.as_ptr()
}
pub unsafe fn __size(&self) -> usize {
self.size
}
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
Self {
ptr: NonNull::new(ptr).unwrap(),
size,
id,
_phantom: PhantomData,
}
}
}
pub unsafe fn __null() -> Self {
HerculesBox {
cpu_shared: None,
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
#[cfg(feature = "cuda")]
impl<'a> HerculesCUDARef<'a> {
pub unsafe fn __ptr(&self) -> *mut u8 {
self.ptr.as_ptr()
}
size: 0,
id: 0,
}
pub unsafe fn __size(&self) -> usize {
self.size
}
pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cpu_ptr() {
return ptr.as_ptr();
}
#[cfg(feature = "cuda")]
{
let cuda_ptr = self.get_cuda_ptr().unwrap();
let cpu_ptr = self.allocate_cpu();
copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size);
return cpu_ptr.as_ptr();
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
Self {
ptr: NonNull::new(ptr).unwrap(),
size,
_phantom: PhantomData,
}
panic!()
}
}
pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
let cpu_ptr = self.__cpu_ptr();
if Some(cpu_ptr) == self.cpu_shared.as_ref().map(|obj| obj.ptr.as_ptr()) {
self.allocate_cpu();
copy_nonoverlapping(
cpu_ptr,
self.cpu_owned.as_ref().unwrap().ptr.as_ptr(),
self.size,
);
#[cfg(feature = "cuda")]
impl<'a> HerculesCUDARefMut<'a> {
pub fn as_ref(self) -> HerculesCUDARef<'a> {
HerculesCUDARef {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
self.cpu_shared = None;
#[cfg(feature = "cuda")]
self.deallocate_cuda();
cpu_ptr
}
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cuda_ptr() {
ptr.as_ptr()
} else {
let cpu_ptr = self.get_cpu_ptr().unwrap();
let cuda_ptr = self.allocate_cuda();
copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size);
cuda_ptr.as_ptr()
}
pub unsafe fn __ptr(&self) -> *mut u8 {
self.ptr.as_ptr()
}
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 {
let cuda_ptr = self.__cuda_ptr();
self.cpu_shared = None;
self.cpu_exclusive = None;
self.deallocate_cpu();
cuda_ptr
pub unsafe fn __size(&self) -> usize {
self.size
}
pub unsafe fn __clone(&self) -> Self {
pub unsafe fn __from_parts(ptr: *mut u8, size: usize) -> Self {
Self {
cpu_shared: self.cpu_shared.clone(),
cpu_exclusive: self.cpu_exclusive.clone(),
cpu_owned: self.cpu_owned.clone(),
#[cfg(feature = "cuda")]
cuda_owned: self.cuda_owned.clone(),
size: self.size,
id: self.id,
ptr: NonNull::new(ptr).unwrap(),
size,
_phantom: PhantomData,
}
}
}
pub unsafe fn __forget(&mut self) {
self.cpu_owned = None;
#[cfg(feature = "cuda")]
{
self.cuda_owned = None;
#[cfg(feature = "cuda")]
impl CUDABox {
pub fn from_cpu_ref(cpu_ref: HerculesCPURef) -> Self {
unsafe {
let size = cpu_ref.size;
let ptr = NonNull::new(__cuda_alloc(size)).unwrap();
__copy_cpu_to_cuda(ptr.as_ptr(), cpu_ref.ptr.as_ptr(), size);
Self { ptr, size }
}
}
pub unsafe fn __offset(&mut self, offset: u64, size: u64) {
if let Some(obj) = self.cpu_shared.as_mut() {
obj.offset += offset as usize;
}
if let Some(obj) = self.cpu_exclusive.as_mut() {
obj.offset += offset as usize;
pub fn from_cuda_ref(cuda_ref: HerculesCUDARef) -> Self {
unsafe {
let size = cuda_ref.size;
let ptr = NonNull::new(__cuda_alloc(size)).unwrap();
__copy_cuda_to_cuda(ptr.as_ptr(), cuda_ref.ptr.as_ptr(), size);
Self { ptr, size }
}
if let Some(obj) = self.cpu_owned.as_mut() {
obj.offset += offset as usize;
}
pub fn get_ref<'a>(&'a self) -> HerculesCUDARef<'a> {
HerculesCUDARef {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
#[cfg(feature = "cuda")]
if let Some(obj) = self.cuda_owned.as_mut() {
obj.offset += offset as usize;
}
pub fn get_ref_mut<'a>(&'a mut self) -> HerculesCUDARefMut<'a> {
HerculesCUDARefMut {
ptr: self.ptr,
size: self.size,
_phantom: PhantomData,
}
self.size = size as usize;
}
}
pub unsafe fn __cmp_ids(&self, other: &HerculesBox<'_>) -> bool {
self.id == other.id
#[cfg(feature = "cuda")]
impl Clone for CUDABox {
fn clone(&self) -> Self {
Self::from_cuda_ref(self.get_ref())
}
}
impl<'a> Drop for HerculesBox<'a> {
#[cfg(feature = "cuda")]
impl Drop for CUDABox {
fn drop(&mut self) {
unsafe {
self.deallocate_cpu();
#[cfg(feature = "cuda")]
self.deallocate_cuda();
__cuda_dealloc(self.ptr.as_ptr(), self.size);
}
}
}
#[macro_export]
macro_rules! runner {
($x: ident) => {
<concat_idents!(HerculesRunner_, $x)>::new()
};
}
extern "C" {
void *cuda_alloc(size_t size) {
void *__cuda_alloc(size_t size) {
void *ptr = NULL;
cudaError_t res = cudaMalloc(&ptr, size);
if (res != cudaSuccess) {
......@@ -8,31 +8,20 @@ extern "C" {
return ptr;
}
void *cuda_alloc_zeroed(size_t size) {
void *ptr = cuda_alloc(size);
if (!ptr) {
return NULL;
}
cudaError_t res = cudaMemset(ptr, 0, size);
if (res != cudaSuccess) {
return NULL;
}
return ptr;
}
void cuda_dealloc(void *ptr) {
void __cuda_dealloc(void *ptr, size_t size) {
(void) size;
cudaFree(ptr);
}
void copy_cpu_to_cuda(void *dst, void *src, size_t size) {
void __copy_cpu_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
}
void copy_cuda_to_cpu(void *dst, void *src, size_t size) {
void __copy_cuda_to_cpu(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost);
}
void copy_cuda_to_cuda(void *dst, void *src, size_t size) {
void __copy_cuda_to_cuda(void *dst, void *src, size_t size) {
cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice);
}
}
#![feature(box_as_ptr, let_chains)]
#![feature(concat_idents)]
use hercules_rt::runner;
juno_build::juno!("call");
fn main() {
async_std::task::block_on(async {
let x = myfunc(7).await;
let y = add(10, 2, 18).await;
let mut r = runner!(myfunc);
let x = r.run(7).await;
let mut r = runner!(add);
let y = r.run(10, 2, 18).await;
assert_eq!(x, y);
});
}
......
#![feature(box_as_ptr, let_chains)]
#![feature(concat_idents)]
use hercules_rt::runner;
juno_build::juno!("ccp");
fn main() {
async_std::task::block_on(async {
let x = tricky(7).await;
let mut r = runner!(tricky);
let x = r.run(7).await;
assert_eq!(x, 1);
});
}
......
#![feature(box_as_ptr, let_chains)]
#![feature(concat_idents)]
use hercules_rt::HerculesBox;
use hercules_rt::{runner, HerculesCPURef};
juno_build::juno!("dot");
......@@ -8,9 +8,10 @@ fn main() {
async_std::task::block_on(async {
let a: [f32; 8] = [0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0, 4.0];
let b: [f32; 8] = [0.0, 5.0, 0.0, 6.0, 0.0, 7.0, 0.0, 8.0];
let a = HerculesBox::from_slice(&a);
let b = HerculesBox::from_slice(&b);
let c = dot(8, a, b).await;
let a = HerculesCPURef::from_slice(&a);
let b = HerculesCPURef::from_slice(&b);
let mut r = runner!(dot);
let c = r.run(8, a, b).await;
println!("{}", c);
assert_eq!(c, 70.0);
});
......
#![feature(concat_idents)]
use hercules_rt::runner;
juno_build::juno!("fac");
fn main() {
async_std::task::block_on(async {
let f = fac(8).await;
let mut r = runner!(fac);
let f = r.run(8).await;
println!("{}", f);
assert_eq!(f, 40320);
});
......