Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • llvm/hercules
1 result
Show changes
Commits on Source (22)
Showing
with 1951 additions and 640 deletions
# Build stage: compile the whole workspace with default features.
build-job:
  stage: build
  script:
    - cargo build

# Test stage: run the suite twice — once with default features and once
# with the CUDA backend enabled, so both device paths stay green.
test-job:
  stage: test
  script:
    - cargo test
    - cargo test --features=cuda
This diff is collapsed.
......@@ -27,4 +27,7 @@ members = [
"juno_samples/nested_ccp",
"juno_samples/antideps",
"juno_samples/implicit_clone",
"juno_samples/concat",
"juno_samples/cava",
]
......@@ -536,6 +536,17 @@ impl<'a> CPUContext<'a> {
self.get_value(collect, true)
)?;
}
Node::Undef { ty } => {
let body = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap().body;
let ty = self.get_type(ty);
write!(
body,
" {} = bitcast {} undef to {}\n",
self.get_value(id, false),
ty,
ty
)?;
}
_ => panic!("PANIC: Can't lower {:?}.", self.function.nodes[id.idx()]),
}
Ok(())
......@@ -602,6 +613,20 @@ impl<'a> CPUContext<'a> {
left.idx(),
right.idx()
)?,
DynamicConstant::Min(left, right) => write!(
body,
" %dc{} = call @llvm.umin.i64(i64%dc{},i64%dc{})\n",
dc.idx(),
left.idx(),
right.idx()
)?,
DynamicConstant::Max(left, right) => write!(
body,
" %dc{} = call @llvm.umax.i64(i64%dc{},i64%dc{})\n",
dc.idx(),
left.idx(),
right.idx()
)?,
}
}
Ok(())
......@@ -848,7 +873,7 @@ fn convert_intrinsic(intrinsic: &Intrinsic, ty: &Type) -> String {
Intrinsic::Log2 => "log2",
Intrinsic::Max => {
if ty.is_float() {
"max"
"maxnum"
} else if ty.is_unsigned() {
"umax"
} else if ty.is_signed() {
......@@ -859,7 +884,7 @@ fn convert_intrinsic(intrinsic: &Intrinsic, ty: &Type) -> String {
}
Intrinsic::Min => {
if ty.is_float() {
"min"
"minnum"
} else if ty.is_unsigned() {
"umin"
} else if ty.is_signed() {
......
......@@ -295,10 +295,19 @@ impl<'a> RTContext<'a> {
ref dynamic_constants,
ref args,
} => {
match self.devices[callee_id.idx()] {
Device::LLVM => {
let device = self.devices[callee_id.idx()];
match device {
// The device backends ensure that device functions have the
// same C interface.
Device::LLVM | Device::CUDA => {
let block = &mut blocks.get_mut(&self.bbs.0[id.idx()]).unwrap();
let device = match device {
Device::LLVM => "cpu",
Device::CUDA => "cuda",
_ => panic!(),
};
// First, get the raw pointers to collections that the
// device function takes as input.
let callee_objs = &self.collection_objects[&callee_id];
......@@ -308,16 +317,18 @@ impl<'a> RTContext<'a> {
if callee_objs.is_mutated(obj) {
write!(
block,
" let arg_tmp{} = unsafe {{ {}.__cpu_ptr_mut() }};\n",
" let arg_tmp{} = unsafe {{ {}.__{}_ptr_mut() }};\n",
idx,
self.get_value(*arg)
self.get_value(*arg),
device
)?;
} else {
write!(
block,
" let arg_tmp{} = unsafe {{ {}.__cpu_ptr() }};\n",
" let arg_tmp{} = unsafe {{ {}.__{}_ptr() }};\n",
idx,
self.get_value(*arg)
self.get_value(*arg),
device
)?;
}
} else {
......@@ -401,7 +412,6 @@ impl<'a> RTContext<'a> {
}
write!(block, ").await;\n")?;
}
_ => todo!(),
}
}
_ => panic!(
......@@ -459,6 +469,20 @@ impl<'a> RTContext<'a> {
self.codegen_dynamic_constant(right, w)?;
write!(w, ")")?;
}
DynamicConstant::Min(left, right) => {
write!(w, "::core::cmp::min(")?;
self.codegen_dynamic_constant(left, w)?;
write!(w, ",")?;
self.codegen_dynamic_constant(right, w)?;
write!(w, ")")?;
}
DynamicConstant::Max(left, right) => {
write!(w, "::core::cmp::max(")?;
self.codegen_dynamic_constant(left, w)?;
write!(w, ",")?;
self.codegen_dynamic_constant(right, w)?;
write!(w, ")")?;
}
}
Ok(())
}
......
use std::cmp::{max, min};
use std::fmt::Write;
use std::ops::Coroutine;
use std::ops::CoroutineState;
......@@ -121,6 +122,8 @@ pub enum DynamicConstant {
Mul(DynamicConstantID, DynamicConstantID),
Div(DynamicConstantID, DynamicConstantID),
Rem(DynamicConstantID, DynamicConstantID),
Min(DynamicConstantID, DynamicConstantID),
Max(DynamicConstantID, DynamicConstantID),
}
/*
......@@ -130,7 +133,7 @@ pub enum DynamicConstant {
* operate on an index list, composing indices at different levels in a type
* tree. Each type that can be indexed has a unique variant in the index enum.
*/
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Index {
Field(usize),
Variant(usize),
......@@ -329,7 +332,7 @@ pub enum Schedule {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Device {
LLVM,
NVVM,
CUDA,
// Entry functions are lowered to async Rust code that calls device
// functions (leaf nodes in the call graph), possibly concurrently.
AsyncRust,
......@@ -445,13 +448,17 @@ impl Module {
| DynamicConstant::Sub(x, y)
| DynamicConstant::Mul(x, y)
| DynamicConstant::Div(x, y)
| DynamicConstant::Rem(x, y) => {
| DynamicConstant::Rem(x, y)
| DynamicConstant::Min(x, y)
| DynamicConstant::Max(x, y) => {
match &self.dynamic_constants[dc_id.idx()] {
DynamicConstant::Add(_, _) => write!(w, "+")?,
DynamicConstant::Sub(_, _) => write!(w, "-")?,
DynamicConstant::Mul(_, _) => write!(w, "*")?,
DynamicConstant::Div(_, _) => write!(w, "/")?,
DynamicConstant::Rem(_, _) => write!(w, "%")?,
DynamicConstant::Min(_, _) => write!(w, "min")?,
DynamicConstant::Max(_, _) => write!(w, "max")?,
_ => (),
}
write!(w, "(")?;
......@@ -1014,6 +1021,14 @@ pub fn evaluate_dynamic_constant(
DynamicConstant::Rem(left, right) => {
Some(evaluate_dynamic_constant(left, dcs)? % evaluate_dynamic_constant(right, dcs)?)
}
DynamicConstant::Min(left, right) => Some(min(
evaluate_dynamic_constant(left, dcs)?,
evaluate_dynamic_constant(right, dcs)?,
)),
DynamicConstant::Max(left, right) => Some(max(
evaluate_dynamic_constant(left, dcs)?,
evaluate_dynamic_constant(right, dcs)?,
)),
}
}
......
use std::cmp::{max, min};
use std::collections::HashMap;
use std::iter::zip;
......@@ -193,7 +194,9 @@ fn typeflow(
| DynamicConstant::Sub(x, y)
| DynamicConstant::Mul(x, y)
| DynamicConstant::Div(x, y)
| DynamicConstant::Rem(x, y) => {
| DynamicConstant::Rem(x, y)
| DynamicConstant::Min(x, y)
| DynamicConstant::Max(x, y) => {
check_dynamic_constants(x, dynamic_constants, num_parameters)
&& check_dynamic_constants(y, dynamic_constants, num_parameters)
}
......@@ -513,12 +516,8 @@ fn typeflow(
Constant::Float64(_) => {
Concrete(get_type_id(Type::Float64, types, reverse_type_map))
}
// Product, summation, and array constants are exceptions.
// Technically, only summation constants need to explicitly
// store their type, but product and array constants also
// explicitly store their type specifically to make this code
// simpler (although their type could be derived from the
// constant itself).
// Product, summation, and array constants are exceptions. They
// all explicitly store their type.
Constant::Product(id, _) => {
if let Type::Product(_) = types[id.idx()] {
Concrete(id)
......@@ -537,7 +536,6 @@ fn typeflow(
))
}
}
// Array typechecking also consists of validating the number of constant elements.
Constant::Array(id) => {
if let Type::Array(_, _) = &types[id.idx()] {
Concrete(id)
......@@ -1126,27 +1124,55 @@ fn types_match(
/*
* Determine if the given dynamic constant matches the parameter's dynamic
* constants when the provided dynamic constants are substituted in for the
* dynamic constants used in the parameter's dynamic constant
* dynamic constants used in the parameter's dynamic constant. Implement dynamic
* constant normalization here as well - i.e., 1 * 2 * 3 = 6.
*/
fn dyn_consts_match(
dynamic_constants: &Vec<DynamicConstant>,
dc_args: &Box<[DynamicConstantID]>,
param: DynamicConstantID,
input: DynamicConstantID,
left: DynamicConstantID,
right: DynamicConstantID,
) -> bool {
// First, try evaluating the DCs and seeing if they're the same value.
if let (Some(cons1), Some(cons2)) = (
evaluate_dynamic_constant(left, dynamic_constants),
evaluate_dynamic_constant(right, dynamic_constants),
) {
return cons1 == cons2;
}
match (
&dynamic_constants[param.idx()],
&dynamic_constants[input.idx()],
&dynamic_constants[left.idx()],
&dynamic_constants[right.idx()],
) {
(DynamicConstant::Constant(x), DynamicConstant::Constant(y)) => x == y,
(DynamicConstant::Parameter(i), _) => input == dc_args[*i],
(DynamicConstant::Add(pl, pr), DynamicConstant::Add(il, ir))
| (DynamicConstant::Sub(pl, pr), DynamicConstant::Sub(il, ir))
| (DynamicConstant::Mul(pl, pr), DynamicConstant::Mul(il, ir))
| (DynamicConstant::Div(pl, pr), DynamicConstant::Div(il, ir))
| (DynamicConstant::Rem(pl, pr), DynamicConstant::Rem(il, ir)) => {
dyn_consts_match(dynamic_constants, dc_args, *pl, *il)
&& dyn_consts_match(dynamic_constants, dc_args, *pr, *ir)
(DynamicConstant::Parameter(l), DynamicConstant::Parameter(r)) => l == r,
(DynamicConstant::Parameter(i), _) => dyn_consts_match(
dynamic_constants,
dc_args,
min(right, dc_args[*i]),
max(right, dc_args[*i]),
),
(_, DynamicConstant::Parameter(i)) => dyn_consts_match(
dynamic_constants,
dc_args,
min(left, dc_args[*i]),
max(left, dc_args[*i]),
),
(DynamicConstant::Add(ll, lr), DynamicConstant::Add(rl, rr))
| (DynamicConstant::Mul(ll, lr), DynamicConstant::Mul(rl, rr))
| (DynamicConstant::Min(ll, lr), DynamicConstant::Min(rl, rr))
| (DynamicConstant::Max(ll, lr), DynamicConstant::Max(rl, rr)) => {
// Normalize for associative ops by always looking at smaller DC ID
// as left arm and larger DC ID as right arm.
dyn_consts_match(dynamic_constants, dc_args, min(*ll, *lr), min(*rl, *rr))
&& dyn_consts_match(dynamic_constants, dc_args, max(*ll, *lr), max(*rl, *rr))
}
(DynamicConstant::Sub(ll, lr), DynamicConstant::Sub(rl, rr))
| (DynamicConstant::Div(ll, lr), DynamicConstant::Div(rl, rr))
| (DynamicConstant::Rem(ll, lr), DynamicConstant::Rem(rl, rr)) => {
dyn_consts_match(dynamic_constants, dc_args, *ll, *rl)
&& dyn_consts_match(dynamic_constants, dc_args, *lr, *rr)
}
(_, _) => false,
}
......@@ -1328,5 +1354,27 @@ fn dyn_const_subst(
reverse_dynamic_constant_map,
)
}
DynamicConstant::Min(l, r) => {
let x = *l;
let y = *r;
let sx = dyn_const_subst(dynamic_constants, reverse_dynamic_constant_map, dc_args, x);
let sy = dyn_const_subst(dynamic_constants, reverse_dynamic_constant_map, dc_args, y);
intern_dyn_const(
DynamicConstant::Min(sx, sy),
dynamic_constants,
reverse_dynamic_constant_map,
)
}
DynamicConstant::Max(l, r) => {
let x = *l;
let y = *r;
let sx = dyn_const_subst(dynamic_constants, reverse_dynamic_constant_map, dc_args, x);
let sy = dyn_const_subst(dynamic_constants, reverse_dynamic_constant_map, dc_args, y);
intern_dyn_const(
DynamicConstant::Max(sx, sy),
dynamic_constants,
reverse_dynamic_constant_map,
)
}
}
}
......@@ -11,6 +11,7 @@ tempfile = "*"
either = "*"
itertools = "*"
take_mut = "*"
union-find = "*"
postcard = { version = "*", features = ["alloc"] }
serde = { version = "*", features = ["derive"] }
hercules_cg = { path = "../hercules_cg" }
......
This diff is collapsed.
......@@ -3,6 +3,7 @@ use std::iter::{empty, once, zip, FromIterator};
use bitvec::prelude::*;
use either::Either;
use union_find::{QuickFindUf, UnionBySize, UnionFind};
use hercules_cg::*;
use hercules_ir::*;
......@@ -551,6 +552,35 @@ fn mutating_objects<'a>(
}
}
/*
 * Enumerate the collection values that `mutator` may write into. A write node
 * mutates its `collect` input; a call node mutates any argument whose
 * corresponding parameter object the callee mutates. All other nodes mutate
 * nothing.
 */
fn mutating_writes<'a>(
    function: &'a Function,
    mutator: NodeID,
    objects: &'a CollectionObjects,
) -> Box<dyn Iterator<Item = NodeID> + 'a> {
    match function.nodes[mutator.idx()] {
        Node::Write {
            collect,
            data: _,
            indices: _,
        } => Box::new(once(collect)),
        Node::Call {
            control: _,
            function: callee,
            dynamic_constants: _,
            ref args,
        } => {
            // Keep only the arguments whose parameter object is mutated
            // inside the callee, according to the collection objects analysis.
            let mutated_args = args.into_iter().enumerate().filter_map(move |(position, arg)| {
                let callee_objects = &objects[&callee];
                let param_obj = callee_objects.param_to_object(position)?;
                callee_objects.is_mutated(param_obj).then_some(*arg)
            });
            Box::new(mutated_args)
        }
        _ => Box::new(empty()),
    }
}
type Liveness = BTreeMap<NodeID, Vec<BTreeSet<NodeID>>>;
/*
......@@ -579,27 +609,60 @@ fn spill_clones(
// Step 2: compute an interference graph from the liveness result. This
// graph contains a vertex per node ID producing a collection value and an
// edge per pair of node IDs that interfere. Nodes A and B interfere if node
// A is defined right above a point where node B is live.
// A is defined right above a point where node B is live and A != B. Extra
// edges are drawn for forwarding reads - when there is a node A that is a
// forwarding read of a node B, A and B really have the same live range for
// the purpose of determining when spills are necessary, since forwarding
// reads can be thought of as nothing but pointer math. For this purpose, we
// maintain a union-find of nodes that form a forwarding read DAG (notably,
// phis and reduces are not considered forwarding reads). The more precise
// version of the interference condition is: nodes A and B interfere if node
// A is defined right above a point where a node C is live, where C is in the
// same union-find class as B.
// Assemble the union-find to group forwarding read DAGs.
let mut union_find = QuickFindUf::<UnionBySize>::new(editor.func().nodes.len());
for id in editor.node_ids() {
for forwarding_read in forwarding_reads(editor.func(), editor.func_id(), id, objects) {
union_find.union(id.idx(), forwarding_read.idx());
}
}
// Figure out which classes contain which node IDs, since we need to iterate
// the disjoint sets.
let mut disjoint_sets: BTreeMap<usize, Vec<NodeID>> = BTreeMap::new();
for id in editor.node_ids() {
disjoint_sets
.entry(union_find.find(id.idx()))
.or_default()
.push(id);
}
// Create the graph.
let mut edges = vec![];
for (bb, liveness) in liveness {
let insts = &bbs.1[bb.idx()];
for (node, live) in zip(insts, liveness.into_iter().skip(1)) {
for live_node in live {
if *node != live_node {
edges.push((*node, live_node));
for live_node in disjoint_sets[&union_find.find(live_node.idx())].iter() {
if *node != *live_node {
edges.push((*node, *live_node));
}
}
}
}
}
// Step 3: filter edges (A, B) to just see edges where A uses B and A isn't
// a terminating read. These are the edges that may require a spill.
// Step 3: filter edges (A, B) to just see edges where A uses B and A
// mutates B. These are the edges that may require a spill.
let mut spill_edges = edges.into_iter().filter(|(a, b)| {
get_uses(&editor.func().nodes[a.idx()])
.as_ref()
.into_iter()
.any(|u| *u == *b)
&& !terminating_reads(editor.func(), editor.func_id(), *a, objects).any(|id| id == *b)
mutating_writes(editor.func(), *a, objects).any(|id| id == *b)
|| (get_uses(&editor.func().nodes[a.idx()])
.as_ref()
.into_iter()
.any(|u| *u == *b)
&& (editor.func().nodes[a.idx()].is_phi()
|| editor.func().nodes[a.idx()].is_reduce()))
});
// Step 4: if there is a spill edge, spill it and return true. Otherwise,
......
......@@ -43,9 +43,6 @@ pub fn inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) {
// Step 4: run inlining on each function individually. Iterate the functions
// in topological order.
for to_inline_id in topo {
if editors[to_inline_id.idx()].func().entry {
continue;
}
// Since Rust cannot analyze the accesses into an array of mutable
// references, we need to do some weirdness here to simultaneously get:
// 1. A mutable reference to the function we're modifying.
......
......@@ -319,7 +319,8 @@ fn compress_return_products(editors: &mut Vec<FunctionEditor>, all_callsites_edi
let old_dcs = dc_param_idx_to_dc_id[..new_dcs.len()].to_vec().clone();
let mut substituted = old_return_type_ids[function_id.idx()];
let first_dc = edit.num_dynamic_constants() + 1;
assert_eq!(old_dcs.len(), new_dcs.len());
let first_dc = edit.num_dynamic_constants() + 100;
for (dc_a, dc_n) in zip(old_dcs, first_dc..) {
substituted = substitute_dynamic_constants_in_type(
dc_a,
......@@ -416,12 +417,37 @@ fn remove_return_singletons(editors: &mut Vec<FunctionEditor>, all_callsites_edi
.collect();
for call_node_id in call_node_ids {
let (_, function, _, _) = editor.func().nodes[call_node_id.idx()].try_call().unwrap();
let (_, function, dc_args, _) =
editor.func().nodes[call_node_id.idx()].try_call().unwrap();
let dc_args = dc_args.clone();
if singleton_removed[function.idx()] {
let edit_successful = editor.edit(|mut edit| {
let empty_constant_id =
edit.add_zero_constant(old_return_type_ids[function.idx()]);
let mut substituted = old_return_type_ids[function.idx()];
let first_dc = edit.num_dynamic_constants() + 100;
let dc_params: Vec<_> = (0..dc_args.len())
.map(|param_idx| {
edit.add_dynamic_constant(DynamicConstant::Parameter(param_idx))
})
.collect();
for (dc_a, dc_n) in zip(dc_params, first_dc..) {
substituted = substitute_dynamic_constants_in_type(
dc_a,
DynamicConstantID::new(dc_n),
substituted,
&mut edit,
);
}
for (dc_n, dc_b) in zip(first_dc.., dc_args.iter()) {
substituted = substitute_dynamic_constants_in_type(
DynamicConstantID::new(dc_n),
*dc_b,
substituted,
&mut edit,
);
}
let empty_constant_id = edit.add_zero_constant(substituted);
let empty_node_id = edit.add_node(Node::Constant {
id: empty_constant_id,
});
......
......@@ -17,6 +17,7 @@ pub mod pass;
pub mod phi_elim;
pub mod pred;
pub mod schedule;
pub mod slf;
pub mod sroa;
pub mod unforkify;
pub mod utils;
......@@ -38,6 +39,7 @@ pub use crate::pass::*;
pub use crate::phi_elim::*;
pub use crate::pred::*;
pub use crate::schedule::*;
pub use crate::slf::*;
pub use crate::sroa::*;
pub use crate::unforkify::*;
pub use crate::utils::*;
......@@ -25,6 +25,8 @@ pub enum Pass {
PhiElim,
Forkify,
ForkGuardElim,
SLF,
WritePredication,
Predication,
SROA,
Inline,
......@@ -469,27 +471,90 @@ impl PassManager {
}
self.clear_analyses();
}
Pass::Predication => {
Pass::SLF => {
self.make_def_uses();
self.make_reverse_postorders();
self.make_doms();
self.make_fork_join_maps();
self.make_typing();
let def_uses = self.def_uses.as_ref().unwrap();
let reverse_postorders = self.reverse_postorders.as_ref().unwrap();
let doms = self.doms.as_ref().unwrap();
let fork_join_maps = self.fork_join_maps.as_ref().unwrap();
let typing = self.typing.as_ref().unwrap();
for idx in 0..self.module.functions.len() {
predication(
let constants_ref =
RefCell::new(std::mem::take(&mut self.module.constants));
let dynamic_constants_ref =
RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
let mut editor = FunctionEditor::new(
&mut self.module.functions[idx],
FunctionID::new(idx),
&constants_ref,
&dynamic_constants_ref,
&types_ref,
&def_uses[idx],
&reverse_postorders[idx],
&doms[idx],
&fork_join_maps[idx],
);
let num_nodes = self.module.functions[idx].nodes.len();
self.module.functions[idx]
.schedules
.resize(num_nodes, vec![]);
slf(&mut editor, &reverse_postorders[idx], &typing[idx]);
self.module.constants = constants_ref.take();
self.module.dynamic_constants = dynamic_constants_ref.take();
self.module.types = types_ref.take();
println!("{}", self.module.functions[idx].name);
self.module.functions[idx].delete_gravestones();
}
self.clear_analyses();
}
Pass::WritePredication => {
self.make_def_uses();
let def_uses = self.def_uses.as_ref().unwrap();
for idx in 0..self.module.functions.len() {
let constants_ref =
RefCell::new(std::mem::take(&mut self.module.constants));
let dynamic_constants_ref =
RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
let mut editor = FunctionEditor::new(
&mut self.module.functions[idx],
FunctionID::new(idx),
&constants_ref,
&dynamic_constants_ref,
&types_ref,
&def_uses[idx],
);
write_predication(&mut editor);
self.module.constants = constants_ref.take();
self.module.dynamic_constants = dynamic_constants_ref.take();
self.module.types = types_ref.take();
self.module.functions[idx].delete_gravestones();
}
self.clear_analyses();
}
Pass::Predication => {
self.make_def_uses();
self.make_typing();
let def_uses = self.def_uses.as_ref().unwrap();
let typing = self.typing.as_ref().unwrap();
for idx in 0..self.module.functions.len() {
let constants_ref =
RefCell::new(std::mem::take(&mut self.module.constants));
let dynamic_constants_ref =
RefCell::new(std::mem::take(&mut self.module.dynamic_constants));
let types_ref = RefCell::new(std::mem::take(&mut self.module.types));
let mut editor = FunctionEditor::new(
&mut self.module.functions[idx],
FunctionID::new(idx),
&constants_ref,
&dynamic_constants_ref,
&types_ref,
&def_uses[idx],
);
predication(&mut editor, &typing[idx]);
self.module.constants = constants_ref.take();
self.module.dynamic_constants = dynamic_constants_ref.take();
self.module.types = types_ref.take();
self.module.functions[idx].delete_gravestones();
}
self.clear_analyses();
......@@ -1002,7 +1067,7 @@ impl PassManager {
.expect("PANIC: Unable to write output module file contents.");
}
}
println!("Ran pass: {:?}", pass);
eprintln!("Ran pass: {:?}", pass);
}
}
......
This diff is collapsed.
use std::collections::BTreeMap;
use hercules_ir::*;
use crate::*;
/*
* The SLF lattice tracks what sub-values of a collection are known. Each sub-
* value is a node ID at a set of indices that were written at. A write to a set
* of indices that structurally matches a previous sub-value removes the old sub-
* value, since that write may overwrite the old known sub-value. The lattice
* top corresponds to every value is 0. When the sub-values at a set of indices
* are not known, the `subvalues` map stores `None` for the known value. When a
* write involves array positions, remove sub-values that are clobbered and
* insert an indices set with an empty positions list and a `None` value.
*/
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct SLFLattice {
    // Maps each set of indices that was written at to the node ID whose value
    // is known to be stored there, or `None` when the value stored at those
    // indices is statically unknown.
    subvalues: BTreeMap<Box<[Index]>, Option<NodeID>>,
}
impl Semilattice for SLFLattice {
    /*
     * Meet keeps a sub-value only when both sides agree on it; every other
     * indices set present in either side maps to `None` (unknown).
     */
    fn meet(a: &Self, b: &Self) -> Self {
        let mut merged = BTreeMap::new();
        for (indices, a_subvalue) in a.subvalues.iter() {
            let agreed = match b.subvalues.get(indices) {
                // Both maps know the same sub-value for this indices set, so
                // it survives the meet untouched.
                Some(b_subvalue) if b_subvalue == a_subvalue => *a_subvalue,
                // Either `b` has no entry here or the sub-values disagree;
                // in both cases the stored value is unknown.
                _ => None,
            };
            merged.insert(indices.clone(), agreed);
        }
        // Indices sets appearing only in `b` have no counterpart in `a`, so
        // their sub-values are unknown as well.
        for indices in b.subvalues.keys() {
            merged.entry(indices.clone()).or_insert(None);
        }
        SLFLattice { subvalues: merged }
    }

    /*
     * Top: no sub-values tracked, corresponding to every value being 0.
     */
    fn top() -> Self {
        SLFLattice {
            subvalues: BTreeMap::new(),
        }
    }

    /*
     * Bottom: a single empty indices set mapping to `None`; the empty set
     * overlaps with every possible indices set.
     */
    fn bottom() -> Self {
        let mut subvalues = BTreeMap::new();
        subvalues.insert(Box::new([]) as Box<[Index]>, None);
        SLFLattice { subvalues }
    }
}
/*
* Top level function to run store-to-load forwarding on a function. Looks for
* known values inside collections and replaces reads of those values with the
* values directly.
*/
pub fn slf(editor: &mut FunctionEditor, reverse_postorder: &Vec<NodeID>, typing: &Vec<TypeID>) {
    // First, run a dataflow analysis that looks at known values inside
    // collections. Thanks to the value semantics of Hercules IR, this analysis
    // is relatively simple and straightforward.
    let func = editor.func();
    let lattice = forward_dataflow(func, reverse_postorder, |inputs, id| {
        match func.nodes[id.idx()] {
            // Phis, reduces, and selects merge what is known about each of
            // their data inputs via the lattice meet.
            Node::Phi {
                control: _,
                data: _,
            }
            | Node::Reduce {
                control: _,
                init: _,
                reduct: _,
            }
            | Node::Ternary {
                op: TernaryOperator::Select,
                first: _,
                second: _,
                third: _,
            } => inputs.into_iter().fold(SLFLattice::top(), |acc, input| {
                SLFLattice::meet(&acc, input)
            }),
            Node::Write {
                collect: _,
                data,
                ref indices,
            } => {
                // Start with the indices of the `collect` input.
                let mut value = inputs[0].clone();
                // Any indices sets that overlap with `indices` become `None`,
                // since we no longer know what's stored there.
                for (other_indices, subvalue) in value.subvalues.iter_mut() {
                    if indices_may_overlap(other_indices, indices) {
                        *subvalue = None;
                    }
                }
                // Track `data` at `indices`.
                value.subvalues.insert(indices.clone(), Some(data));
                value
            }
            // Any other node producing a collection tells us nothing about
            // its contents.
            _ => SLFLattice::bottom(),
        }
    });

    // Second, look for reads where the indices set either:
    // 1. Equal the indices of a known sub-value. Then, the read can be replaced
    //    by the known sub-value.
    // 2. Otherwise, if the indices set doesn't overlap with any known or
    //    unknown sub-value, then the read can be replaced by a zero constant.
    // 3. Otherwise, the read can't be replaced.
    // Keep track of which nodes we've already replaced, since a sub-value we
    // knew previously may be the ID of an old node replaced previously.
    let mut replacements = BTreeMap::new();
    for id in editor.node_ids() {
        let Node::Read {
            collect,
            ref indices,
        } = editor.func().nodes[id.idx()]
        else {
            continue;
        };
        let subvalues = &lattice[collect.idx()].subvalues;
        if let Some(sub_value) = subvalues.get(indices)
            && let Some(mut known) = *sub_value
        {
            // Chase the replacement chain so we forward to the live node,
            // not to a node this pass already deleted.
            while let Some(replacement) = replacements.get(&known) {
                known = *replacement;
            }
            editor.edit(|mut edit| {
                edit = edit.replace_all_uses(id, known)?;
                edit.delete_node(id)
            });
            replacements.insert(id, known);
        } else if !subvalues
            .keys()
            .any(|other_indices| indices_may_overlap(other_indices, indices))
        {
            // No write ever touched these indices, so per the lattice top
            // semantics the read produces zero.
            editor.edit(|mut edit| {
                let zero = edit.add_zero_constant(typing[id.idx()]);
                let zero = edit.add_node(Node::Constant { id: zero });
                edit = edit.replace_all_uses(id, zero)?;
                edit.delete_node(id)
            });
        }
    }
}
use std::iter::zip;
use hercules_ir::def_use::*;
use hercules_ir::ir::*;
......@@ -130,6 +132,24 @@ pub(crate) fn substitute_dynamic_constants(
dc_c
}
}
DynamicConstant::Min(left, right) => {
let new_left = substitute_dynamic_constants(dc_a, dc_b, left, edit);
let new_right = substitute_dynamic_constants(dc_a, dc_b, right, edit);
if new_left != left || new_right != right {
edit.add_dynamic_constant(DynamicConstant::Min(new_left, new_right))
} else {
dc_c
}
}
DynamicConstant::Max(left, right) => {
let new_left = substitute_dynamic_constants(dc_a, dc_b, left, edit);
let new_right = substitute_dynamic_constants(dc_a, dc_b, right, edit);
if new_left != left || new_right != right {
edit.add_dynamic_constant(DynamicConstant::Max(new_left, new_right))
} else {
dc_c
}
}
}
}
......@@ -223,7 +243,7 @@ pub(crate) fn substitute_dynamic_constants_in_node(
/*
* Top level function to make a function have only a single return.
*/
pub fn collapse_returns(editor: &mut FunctionEditor) -> Option<NodeID> {
pub(crate) fn collapse_returns(editor: &mut FunctionEditor) -> Option<NodeID> {
let returns: Vec<NodeID> = (0..editor.func().nodes.len())
.filter(|idx| editor.func().nodes[*idx].is_return())
.map(NodeID::new)
......@@ -263,7 +283,7 @@ pub fn collapse_returns(editor: &mut FunctionEditor) -> Option<NodeID> {
new_return
}
pub fn contains_between_control_flow(func: &Function) -> bool {
pub(crate) fn contains_between_control_flow(func: &Function) -> bool {
let num_control = func.nodes.iter().filter(|node| node.is_control()).count();
assert!(num_control >= 2, "PANIC: A Hercules function must have at least two control nodes: a start node and at least one return node.");
num_control > 2
......@@ -273,7 +293,7 @@ pub fn contains_between_control_flow(func: &Function) -> bool {
* Top level function to ensure a Hercules function contains at least one
* control node that isn't the start or return nodes.
*/
pub fn ensure_between_control_flow(editor: &mut FunctionEditor) -> Option<NodeID> {
pub(crate) fn ensure_between_control_flow(editor: &mut FunctionEditor) -> Option<NodeID> {
if !contains_between_control_flow(editor.func()) {
let ret = editor
.node_ids()
......@@ -308,3 +328,51 @@ pub fn ensure_between_control_flow(editor: &mut FunctionEditor) -> Option<NodeID
)
}
}
/*
* Helper function to tell if two lists of indices have the same structure.
*/
/*
 * Two index lists are structurally equivalent when they have the same length
 * and every corresponding pair has the same shape: equal field numbers, equal
 * variant numbers, or position lists (which are asserted to be equally long).
 */
pub(crate) fn indices_structurally_equivalent(indices1: &[Index], indices2: &[Index]) -> bool {
    indices1.len() == indices2.len()
        && zip(indices1, indices2).all(|pair| match pair {
            (Index::Field(idx1), Index::Field(idx2)) => idx1 == idx2,
            (Index::Variant(idx1), Index::Variant(idx2)) => idx1 == idx2,
            (Index::Position(ref pos1), Index::Position(ref pos2)) => {
                assert_eq!(pos1.len(), pos2.len());
                true
            }
            // Mismatched index kinds are never structurally equivalent.
            _ => false,
        })
}
/*
* Helper function to determine if two lists of indices may overlap.
*/
/*
 * Conservatively decide whether two lists of indices may refer to overlapping
 * memory. Only differing field numbers prove disjointness; variants share the
 * same underlying memory and positions may be equal at runtime. `zip` stops at
 * the shorter list - a prefix match means one indexes a larger sub-value
 * containing the other, which overlaps.
 */
pub(crate) fn indices_may_overlap(indices1: &[Index], indices2: &[Index]) -> bool {
    zip(indices1, indices2).all(|pair| match pair {
        // Distinct fields are disjoint; equal fields may overlap.
        (Index::Field(idx1), Index::Field(idx2)) => idx1 == idx2,
        // Variants alias the same memory; position indices may coincide at
        // runtime, so both always may overlap.
        (Index::Variant(_), Index::Variant(_)) | (Index::Position(_), Index::Position(_)) => true,
        // Mixing index kinds at the same depth is a type error upstream.
        _ => panic!(),
    })
}
......@@ -4,5 +4,8 @@ version = "0.1.0"
authors = ["Russel Arbore <rarbore2@illinois.edu>"]
edition = "2021"
[features]
cuda = []
[dependencies]
use std::env::var;
use std::path::Path;
use std::process::Command;
/// Build script: when the `cuda` feature is enabled, compile the CUDA runtime
/// stubs with `nvcc`, archive them into `librtdefs.a`, and emit the link
/// directives the Rust runtime needs. Without the feature this is a no-op.
fn main() {
    if cfg!(feature = "cuda") {
        let out_dir = var("OUT_DIR").unwrap();

        // Compile rtdefs.cu to an object file inside OUT_DIR.
        let nvcc_status = Command::new("nvcc")
            .args(&["src/rtdefs.cu", "-c", "-o"])
            .arg(&format!("{}/rtdefs.o", out_dir))
            .status()
            .expect("PANIC: NVCC failed when building runtime. Is NVCC installed?");
        // `status()` only errors if the process couldn't be spawned - check
        // the exit code too, so a CUDA compile error fails the build instead
        // of being silently ignored.
        assert!(
            nvcc_status.success(),
            "PANIC: NVCC returned a non-zero exit code while building runtime."
        );

        // Archive the object file; paths are relative to OUT_DIR.
        let ar_status = Command::new("ar")
            .args(&["crus", "librtdefs.a", "rtdefs.o"])
            .current_dir(&Path::new(&out_dir))
            .status()
            .expect("PANIC: ar failed when building runtime. Is ar installed?");
        assert!(
            ar_status.success(),
            "PANIC: ar returned a non-zero exit code while building runtime."
        );

        println!("cargo::rustc-link-search=native={}", out_dir);
        // NOTE(review): hard-coded CUDA library path assumes a Debian-style
        // x86-64 layout - confirm for other platforms.
        println!("cargo::rustc-link-search=native=/usr/lib/x86_64-linux-gnu/");
        println!("cargo::rustc-link-lib=static=rtdefs");
        println!("cargo::rustc-link-lib=cudart");
        println!("cargo::rerun-if-changed=src/rtdefs.cu");
    }
}
......@@ -4,6 +4,16 @@ use std::mem::swap;
use std::ptr::{copy_nonoverlapping, NonNull};
use std::slice::from_raw_parts;
#[cfg(feature = "cuda")]
extern "C" {
fn cuda_alloc(size: usize) -> *mut u8;
fn cuda_alloc_zeroed(size: usize) -> *mut u8;
fn cuda_dealloc(ptr: *mut u8);
fn copy_cpu_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cpu(dst: *mut u8, src: *mut u8, size: usize);
fn copy_cuda_to_cuda(dst: *mut u8, src: *mut u8, size: usize);
}
/*
* An in-memory collection object that can be used by functions compiled by the
* Hercules compiler.
......@@ -13,16 +23,23 @@ pub struct HerculesBox<'a> {
cpu_exclusive: Option<NonNull<u8>>,
cpu_owned: Option<NonNull<u8>>,
#[cfg(feature = "cuda")]
cuda_owned: Option<NonNull<u8>>,
size: usize,
_phantom: PhantomData<&'a u8>,
}
impl<'a> HerculesBox<'a> {
impl<'b, 'a: 'b> HerculesBox<'a> {
pub fn from_slice<T>(slice: &'a [T]) -> Self {
HerculesBox {
cpu_shared: Some(unsafe { NonNull::new_unchecked(slice.as_ptr() as *mut u8) }),
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: slice.len() * size_of::<T>(),
_phantom: PhantomData,
}
......@@ -33,36 +50,69 @@ impl<'a> HerculesBox<'a> {
cpu_shared: None,
cpu_exclusive: Some(unsafe { NonNull::new_unchecked(slice.as_mut_ptr() as *mut u8) }),
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: slice.len() * size_of::<T>(),
_phantom: PhantomData,
}
}
pub fn as_slice<T>(&'a self) -> &'a [T] {
pub fn as_slice<T>(&'b mut self) -> &'b [T] {
assert_eq!(self.size % size_of::<T>(), 0);
unsafe { from_raw_parts(self.__cpu_ptr() as *const T, self.size / size_of::<T>()) }
}
unsafe fn into_cpu(&self) -> NonNull<u8> {
self.cpu_shared
.or(self.cpu_exclusive)
.or(self.cpu_owned)
.unwrap()
unsafe fn get_cpu_ptr(&self) -> Option<NonNull<u8>> {
self.cpu_owned.or(self.cpu_exclusive).or(self.cpu_shared)
}
#[cfg(feature = "cuda")]
unsafe fn get_cuda_ptr(&self) -> Option<NonNull<u8>> {
self.cuda_owned
}
unsafe fn into_cpu_mut(&mut self) -> NonNull<u8> {
if let Some(ptr) = self.cpu_exclusive.or(self.cpu_owned) {
unsafe fn allocate_cpu(&mut self) -> NonNull<u8> {
if let Some(ptr) = self.cpu_owned {
ptr
} else {
let ptr =
NonNull::new(alloc(Layout::from_size_align_unchecked(self.size, 16))).unwrap();
copy_nonoverlapping(self.cpu_shared.unwrap().as_ptr(), ptr.as_ptr(), self.size);
self.cpu_owned = Some(ptr);
self.cpu_shared = None;
ptr
}
}
#[cfg(feature = "cuda")]
unsafe fn allocate_cuda(&mut self) -> NonNull<u8> {
if let Some(ptr) = self.cuda_owned {
ptr
} else {
let ptr = cuda_alloc(self.size);
self.cuda_owned = Some(NonNull::new(ptr).unwrap());
self.cuda_owned.unwrap()
}
}
unsafe fn deallocate_cpu(&mut self) {
if let Some(ptr) = self.cpu_owned {
dealloc(
ptr.as_ptr(),
Layout::from_size_align_unchecked(self.size, 16),
);
self.cpu_owned = None;
}
}
#[cfg(feature = "cuda")]
unsafe fn deallocate_cuda(&mut self) {
if let Some(ptr) = self.cuda_owned {
cuda_dealloc(ptr.as_ptr());
self.cuda_owned = None;
}
}
pub unsafe fn __zeros(size: u64) -> Self {
assert_ne!(size, 0);
let size = size as usize;
......@@ -72,6 +122,10 @@ impl<'a> HerculesBox<'a> {
cpu_owned: Some(
NonNull::new(alloc_zeroed(Layout::from_size_align_unchecked(size, 16))).unwrap(),
),
#[cfg(feature = "cuda")]
cuda_owned: None,
size: size,
_phantom: PhantomData,
}
......@@ -82,6 +136,10 @@ impl<'a> HerculesBox<'a> {
cpu_shared: None,
cpu_exclusive: None,
cpu_owned: None,
#[cfg(feature = "cuda")]
cuda_owned: None,
size: 0,
_phantom: PhantomData,
}
......@@ -93,24 +151,61 @@ impl<'a> HerculesBox<'a> {
ret
}
pub unsafe fn __cpu_ptr(&self) -> *mut u8 {
self.into_cpu().as_ptr()
pub unsafe fn __cpu_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cpu_ptr() {
return ptr.as_ptr();
}
#[cfg(feature = "cuda")]
{
let cuda_ptr = self.get_cuda_ptr().unwrap();
let cpu_ptr = self.allocate_cpu();
copy_cuda_to_cpu(cpu_ptr.as_ptr(), cuda_ptr.as_ptr(), self.size);
return cpu_ptr.as_ptr();
}
panic!()
}
pub unsafe fn __cpu_ptr_mut(&mut self) -> *mut u8 {
self.into_cpu_mut().as_ptr()
let cpu_ptr = self.__cpu_ptr();
if Some(cpu_ptr) == self.cpu_shared.map(|nn| nn.as_ptr()) {
self.allocate_cpu();
copy_nonoverlapping(cpu_ptr, self.cpu_owned.unwrap().as_ptr(), self.size);
}
self.cpu_shared = None;
self.cpu_exclusive = None;
#[cfg(feature = "cuda")]
self.deallocate_cuda();
cpu_ptr
}
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr(&mut self) -> *mut u8 {
if let Some(ptr) = self.get_cuda_ptr() {
ptr.as_ptr()
} else {
let cpu_ptr = self.get_cpu_ptr().unwrap();
let cuda_ptr = self.allocate_cuda();
copy_cpu_to_cuda(cuda_ptr.as_ptr(), cpu_ptr.as_ptr(), self.size);
cuda_ptr.as_ptr()
}
}
#[cfg(feature = "cuda")]
pub unsafe fn __cuda_ptr_mut(&mut self) -> *mut u8 {
let cuda_ptr = self.__cuda_ptr();
self.cpu_shared = None;
self.cpu_exclusive = None;
self.deallocate_cpu();
cuda_ptr
}
}
impl<'a> Drop for HerculesBox<'a> {
fn drop(&mut self) {
if let Some(ptr) = self.cpu_owned {
unsafe {
dealloc(
ptr.as_ptr(),
Layout::from_size_align_unchecked(self.size, 16),
)
}
unsafe {
self.deallocate_cpu();
#[cfg(feature = "cuda")]
self.deallocate_cuda();
}
}
}