diff --git a/hercules_ir/src/ir.rs b/hercules_ir/src/ir.rs index 5dfe2915f5f3e30f56b6665dc27d23cd40cca3d4..f6aafa35bd2c2d94324e63b2b91213ad0c2e9c4f 100644 --- a/hercules_ir/src/ir.rs +++ b/hercules_ir/src/ir.rs @@ -1048,9 +1048,20 @@ impl Constant { } } - /* - * Useful for GVN. - */ + pub fn is_false(&self) -> bool { + match self { + Constant::Boolean(false) => true, + _ => false, + } + } + + pub fn is_true(&self) -> bool { + match self { + Constant::Boolean(true) => true, + _ => false, + } + } + pub fn is_zero(&self) -> bool { match self { Constant::Integer8(0) => true, diff --git a/hercules_opt/src/inline.rs b/hercules_opt/src/inline.rs index 38ed1b22d2d81be971aa867857fd01e3752ecc0a..9b0a9200b6301a8928f5fda2f70b515fc1d45dde 100644 --- a/hercules_opt/src/inline.rs +++ b/hercules_opt/src/inline.rs @@ -307,7 +307,11 @@ impl ParameterLattice { * These functions can have that constant "inlined" - the parameter is removed * and all uses of the parameter becomes uses of the constant directly. */ -pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { +pub fn const_inline( + editors: &mut [FunctionEditor], + callgraph: &CallGraph, + inline_collections: bool, +) { // Run const inlining on each function, starting at the most shallow // function first, since we want to propagate constants down the call graph. for func_id in callgraph.topo().into_iter().rev() { @@ -361,22 +365,29 @@ pub fn const_inline(editors: &mut [FunctionEditor], callgraph: &CallGraph) { let mut param_tys = edit.get_param_types().clone(); let mut decrement_index_by = 0; for idx in 0..param_tys.len() { - if let Some(node) = match param_lattice[idx] { - ParameterLattice::Top => Some(Node::Undef { ty: param_tys[idx] }), - ParameterLattice::Constant(id) => Some(Node::Constant { id }), - ParameterLattice::DynamicConstant(id, _) => { - // Rust moment. - let maybe_cons = edit.get_dynamic_constant(id).try_constant(); - if let Some(val) = maybe_cons { - Some(Node::DynamicConstant { - id: edit.add_dynamic_constant(DynamicConstant::Constant(val)), - }) - } else { - None + if (inline_collections + || edit + .get_type(param_tys[idx - decrement_index_by]) + .is_primitive()) + && let Some(node) = match param_lattice[idx] { + ParameterLattice::Top => Some(Node::Undef { + ty: param_tys[idx - decrement_index_by], + }), + ParameterLattice::Constant(id) => Some(Node::Constant { id }), + ParameterLattice::DynamicConstant(id, _) => { + // Rust moment. + let maybe_cons = edit.get_dynamic_constant(id).try_constant(); + if let Some(val) = maybe_cons { + Some(Node::DynamicConstant { + id: edit.add_dynamic_constant(DynamicConstant::Constant(val)), + }) + } else { + None + } } + _ => None, } - _ => None, - } && let Some(ids) = param_idx_to_ids.get(&idx) + && let Some(ids) = param_idx_to_ids.get(&idx) { let node = edit.add_node(node); for id in ids { diff --git a/hercules_opt/src/pred.rs b/hercules_opt/src/pred.rs index ed7c3a855b016608aa194cc9f2cd89f05d836bde..8f1d07454262a621d8f9c753a3c68b01752b3dc7 100644 --- a/hercules_opt/src/pred.rs +++ b/hercules_opt/src/pred.rs @@ -136,6 +136,77 @@ pub fn predication(editor: &mut FunctionEditor, typing: &Vec<TypeID>) { bad_branches.insert(branch); } } + + // Do a quick and dirty rewrite to convert select(a, b, false) to a && b and + // select(a, b, true) to a || b. + for id in editor.node_ids() { + let nodes = &editor.func().nodes; + if let Node::Ternary { + op: TernaryOperator::Select, + first, + second, + third, + } = nodes[id.idx()] + { + if let Some(cons) = nodes[second.idx()].try_constant() + && editor.get_constant(cons).is_false() + { + editor.edit(|mut edit| { + let inv = edit.add_node(Node::Unary { + op: UnaryOperator::Not, + input: first, + }); + let node = edit.add_node(Node::Binary { + op: BinaryOperator::And, + left: inv, + right: third, + }); + edit = edit.replace_all_uses(id, node)?; + edit.delete_node(id) + }); + } else if let Some(cons) = nodes[third.idx()].try_constant() + && editor.get_constant(cons).is_false() + { + editor.edit(|mut edit| { + let node = edit.add_node(Node::Binary { + op: BinaryOperator::And, + left: first, + right: second, + }); + edit = edit.replace_all_uses(id, node)?; + edit.delete_node(id) + }); + } else if let Some(cons) = nodes[second.idx()].try_constant() + && editor.get_constant(cons).is_true() + { + editor.edit(|mut edit| { + let node = edit.add_node(Node::Binary { + op: BinaryOperator::Or, + left: first, + right: third, + }); + edit = edit.replace_all_uses(id, node)?; + edit.delete_node(id) + }); + } else if let Some(cons) = nodes[third.idx()].try_constant() + && editor.get_constant(cons).is_true() + { + editor.edit(|mut edit| { + let inv = edit.add_node(Node::Unary { + op: UnaryOperator::Not, + input: first, + }); + let node = edit.add_node(Node::Binary { + op: BinaryOperator::Or, + left: inv, + right: second, + }); + edit = edit.replace_all_uses(id, node)?; + edit.delete_node(id) + }); + } + } + } } /* diff --git a/hercules_opt/src/schedule.rs b/hercules_opt/src/schedule.rs index d7ae40488d75a1da7ef65b8a53a894bc0f62cded..9bc7823ee7f5837cf49387170e548a9174340f42 100644 --- a/hercules_opt/src/schedule.rs +++ b/hercules_opt/src/schedule.rs @@ -69,6 +69,26 @@ pub fn infer_parallel_reduce( chain_id = reduct; } + // If the use is a phi that uses the reduce and a write, then we might + // want to parallelize this still. Set the chain ID to the write. + if let Node::Phi { + control: _, + ref data, + } = func.nodes[chain_id.idx()] + && data.len() + == data + .into_iter() + .filter(|phi_use| **phi_use == last_reduce) + .count() + + 1 + { + chain_id = *data + .into_iter() + .filter(|phi_use| **phi_use != last_reduce) + .next() + .unwrap(); + } + // Check for a Write-Reduce tight cycle. if let Node::Write { collect, @@ -130,12 +150,13 @@ pub fn infer_monoid_reduce( reduce_cycles: &HashMap<NodeID, HashSet<NodeID>>, ) { let is_binop_monoid = |op| { - matches!( - op, - BinaryOperator::Add | BinaryOperator::Mul | BinaryOperator::Or | BinaryOperator::And - ) + op == BinaryOperator::Add + || op == BinaryOperator::Mul + || op == BinaryOperator::Or + || op == BinaryOperator::And }; - let is_intrinsic_monoid = |intrinsic| matches!(intrinsic, Intrinsic::Max | Intrinsic::Min); + let is_intrinsic_monoid = + |intrinsic| intrinsic == Intrinsic::Max || intrinsic == Intrinsic::Min; for id in editor.node_ids() { let func = editor.func(); diff --git a/juno_samples/rodinia/bfs/src/bfs.jn b/juno_samples/rodinia/bfs/src/bfs.jn index 51dcd945429dfde02cb2313afa404e81f8722c84..2534a89c627f137bf4a65a7f3d61879c3d3670e6 100644 --- a/juno_samples/rodinia/bfs/src/bfs.jn +++ b/juno_samples/rodinia/bfs/src/bfs.jn @@ -43,10 +43,10 @@ fn bfs<n, m: usize>(graph_nodes: Node[n], source: u32, edges: u32[m]) -> i32[n] } @loop2 for i in 0..n { + stop = stop && !updated[i]; if updated[i] { mask[i] = true; visited[i] = true; - stop = false; updated[i] = false; } } diff --git a/juno_samples/rodinia/bfs/src/cpu.sch b/juno_samples/rodinia/bfs/src/cpu.sch index 44cfa8ad0161fac0afbccc2d383637ec8a2f1aa0..ae67fdd987e961a95311a7d3aaa0f94fe31f1687 100644 --- a/juno_samples/rodinia/bfs/src/cpu.sch +++ b/juno_samples/rodinia/bfs/src/cpu.sch @@ -23,7 +23,8 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +predication(*); +simpl!(*); unforkify(*); - gcm(*); diff --git a/juno_samples/rodinia/cfd/src/cpu_euler.sch b/juno_samples/rodinia/cfd/src/cpu_euler.sch index 5fe48a8395cfb6fada1d668b4f73fa6eb3487f5e..1244f80e54fdad43f58e5c5a5af44646b7a83e89 100644 --- a/juno_samples/rodinia/cfd/src/cpu_euler.sch +++ b/juno_samples/rodinia/cfd/src/cpu_euler.sch @@ -24,7 +24,8 @@ fixpoint { fork-guard-elim(*); } simpl!(*); +no-memset(compute_step_factor@res, compute_flux@res, copy_vars@res); +parallel-reduce(time_step, copy_vars, compute_flux@outer_loop \ compute_flux@inner_loop); unforkify(*); - gcm(*); diff --git a/juno_samples/rodinia/cfd/src/euler.jn b/juno_samples/rodinia/cfd/src/euler.jn index 203cfd96008237f57ec276973d70304e56159682..6966f5ba0887388cf02bafcf80ed66e4059b8b7d 100644 --- a/juno_samples/rodinia/cfd/src/euler.jn +++ b/juno_samples/rodinia/cfd/src/euler.jn @@ -47,7 +47,7 @@ fn compute_speed_of_sound(density: f32, pressure: f32) -> f32 { } fn compute_step_factor<nelr: usize>(variables: Variables::<nelr>, areas: f32[nelr]) -> f32[nelr] { - let step_factors : f32[nelr]; + @res let step_factors : f32[nelr]; for i in 0..nelr { let density = variables.density[i]; @@ -106,9 +106,9 @@ fn compute_flux<nelr: usize>( ff_flux_contribution_momentum_z: float3, ) -> Variables::<nelr> { const smoothing_coefficient : f32 = 0.2; - let fluxes: Variables::<nelr>; + @res let fluxes: Variables::<nelr>; - for i in 0..nelr { + @outer_loop for i in 0..nelr { let density_i = variables.density[i]; let momentum_i = float3 { x: variables.momentum.x[i], @@ -131,7 +131,7 @@ fn compute_flux<nelr: usize>( let flux_i_momentum = float3 { x: 0.0, y: 0.0, z: 0.0 }; let flux_i_density_energy : f32 = 0.0; - for j in 0..NNB { + @inner_loop for j in 0..NNB { let nb = elements_surrounding_elements[j, i]; let normal = float3 { x: normals.x[j, i], @@ -249,7 +249,7 @@ fn time_step<nelr: usize>( } fn copy_vars<nelr: usize>(variables: Variables::<nelr>) -> Variables::<nelr> { - let result : Variables::<nelr>; + @res let result : Variables::<nelr>; for i in 0..nelr { result.density[i] = variables.density[i]; diff --git a/juno_samples/rodinia/srad/benches/srad_bench.rs b/juno_samples/rodinia/srad/benches/srad_bench.rs index d327454002a6f9cabe4c40f74098570ea0d22d66..728702d9bcc18405ef291945f81413f49f5715af 100644 --- a/juno_samples/rodinia/srad/benches/srad_bench.rs +++ b/juno_samples/rodinia/srad/benches/srad_bench.rs @@ -13,8 +13,8 @@ fn srad_bench(c: &mut Criterion) { let mut r = runner!(srad); let niter = 100; let lambda = 0.5; - let nrows = 502; - let ncols = 458; + let nrows = 512; + let ncols = 512; let image = "data/image.pgm".to_string(); let Image { image: image_ori, diff --git a/juno_samples/rodinia/srad/src/cpu.sch b/juno_samples/rodinia/srad/src/cpu.sch index 1a81ddad3b55bcf9ffb76660ebdc1069338affd4..2b45e8c956e10cb6af538282df98e32eb35b6b5e 100644 --- a/juno_samples/rodinia/srad/src/cpu.sch +++ b/juno_samples/rodinia/srad/src/cpu.sch @@ -28,6 +28,7 @@ fixpoint { fork-coalesce(*); } simpl!(*); +fork-interchange[0, 1](loop1); fork-split(*); unforkify(*); diff --git a/juno_samples/rodinia/srad/src/gpu.sch b/juno_samples/rodinia/srad/src/gpu.sch index 149d5cd2fd71005ade5cdbb3461e08b3e65ab34f..289548f9e01cdf402a3e1b1057fa52d4029f6173 100644 --- a/juno_samples/rodinia/srad/src/gpu.sch +++ b/juno_samples/rodinia/srad/src/gpu.sch @@ -1,23 +1,57 @@ -gvn(*); -dce(*); +macro simpl!(X) { + ccp(X); + simplify-cfg(X); + lift-dc-math(X); + gvn(X); + phi-elim(X); + dce(X); + infer-schedules(X); +} + phi-elim(*); -dce(*); +let sum_loop = outline(srad@loop1); +let main_loops = outline(srad@loop2 | srad@loop3); +gpu(main_loops, extract, compress); +simpl!(*); +const-inline[true](*); crc(*); -dce(*); slf(*); -dce(*); - -let auto = auto-outline(srad); -gpu(auto.srad); - -inline(auto.srad); -inline(auto.srad); -delete-uncalled(*); +write-predication(*); +simpl!(*); +predication(*); +simpl!(*); +predication(*); +simpl!(*); +fixpoint { + forkify(*); + fork-guard-elim(*); + fork-coalesce(*); +} +simpl!(*); +reduce-slf(*); +simpl!(*); +array-slf(*); +simpl!(*); +slf(*); +simpl!(*); -sroa[false](auto.srad); -dce(*); -float-collections(*); -dce(*); +fork-dim-merge(sum_loop); +simpl!(sum_loop); +fork-tile[32, 0, false, true](sum_loop); +let out = fork-split(sum_loop); +clean-monoid-reduces(sum_loop); +simpl!(sum_loop); +let fission = fork-fission[out.srad_0.fj0](sum_loop); +simpl!(sum_loop); +fork-tile[32, 0, false, true](fission.srad_0.fj_bottom); +let out = fork-split(fission.srad_0.fj_bottom); +clean-monoid-reduces(sum_loop); +simpl!(sum_loop); +let top = outline(fission.srad_0.fj_top); +let bottom = outline(out.srad_0.fj0); +gpu(top, bottom); +ip-sroa(*); +sroa(*); +simpl!(*); gcm(*); - diff --git a/juno_samples/rodinia/srad/src/lib.rs b/juno_samples/rodinia/srad/src/lib.rs index d63660070ff0f61d47057ea00b14b3fb31db6e09..a647b94a5ffc8aad3bab91badc1bd58a305e7e75 100644 --- a/juno_samples/rodinia/srad/src/lib.rs +++ b/juno_samples/rodinia/srad/src/lib.rs @@ -114,7 +114,7 @@ pub fn srad_harness(args: SRADInputs) { .max() .unwrap_or(0); assert!( - max_diff <= 1, + max_diff <= 2, "Verification failed: maximum pixel difference of {} exceeds threshold of 1", max_diff ); diff --git a/juno_samples/rodinia/srad/src/main.rs b/juno_samples/rodinia/srad/src/main.rs index 87d1e7e8504584478f51ac2b9dc20dbc04716c81..20da11e73ef8eb90bcf8fde31ca3fa33c734c582 100644 --- a/juno_samples/rodinia/srad/src/main.rs +++ b/juno_samples/rodinia/srad/src/main.rs @@ -12,8 +12,8 @@ fn srad_test() { srad_harness(SRADInputs { niter: 100, lambda: 0.5, - nrows: 502, - ncols: 458, + nrows: 512, + ncols: 512, image: "data/image.pgm".to_string(), output: None, verify: true, diff --git a/juno_samples/rodinia/srad/src/srad.jn b/juno_samples/rodinia/srad/src/srad.jn index 3e016a99b574c1dcde982e7277a5cbcdc1743c19..6074bf8cb12ccc2ad29c1086d7620b3ef98bcf59 100644 --- a/juno_samples/rodinia/srad/src/srad.jn +++ b/juno_samples/rodinia/srad/src/srad.jn @@ -50,10 +50,10 @@ fn srad<nrows, ncols: usize>( let varROI = (sum2 / nelems as f32) - meanROI * meanROI; let q0sqr = varROI / (meanROI * meanROI); - let dN : f32[ncols, nrows]; - let dS : f32[ncols, nrows]; - let dE : f32[ncols, nrows]; - let dW : f32[ncols, nrows]; + @dirs let dN : f32[ncols, nrows]; + @dirs let dS : f32[ncols, nrows]; + @dirs let dE : f32[ncols, nrows]; + @dirs let dW : f32[ncols, nrows]; let c : f32[ncols, nrows]; diff --git a/juno_scheduler/src/ir.rs b/juno_scheduler/src/ir.rs index a0db884492120a43d0bb8fff89e689746ef1579e..6aa85fe53689cf015497e56850ef0c197ccbdae0 100644 --- a/juno_scheduler/src/ir.rs +++ b/juno_scheduler/src/ir.rs @@ -54,14 +54,15 @@ impl Pass { pub fn is_valid_num_args(&self, num: usize) -> bool { match self { Pass::ArrayToProduct => num == 0 || num == 1, + Pass::ConstInline => num == 0 || num == 1, Pass::ForkChunk => num == 4, Pass::ForkExtend => num == 1, Pass::ForkFissionBufferize => num == 2 || num == 1, Pass::ForkInterchange => num == 2, + Pass::InterproceduralSROA => num == 0 || num == 1, Pass::Print => num == 1, Pass::Rename => num == 1, Pass::SROA => num == 0 || num == 1, - Pass::InterproceduralSROA => num == 0 || num == 1, Pass::Xdot => num == 0 || num == 1, _ => num == 0, } @@ -70,14 +71,15 @@ impl Pass { pub fn valid_arg_nums(&self) -> &'static str { match self { Pass::ArrayToProduct => "0 or 1", + Pass::ConstInline => "0 or 1", Pass::ForkChunk => "4", Pass::ForkExtend => "1", Pass::ForkFissionBufferize => "1 or 2", Pass::ForkInterchange => "2", + Pass::InterproceduralSROA => "0 or 1", Pass::Print => "1", Pass::Rename => "1", Pass::SROA => "0 or 1", - Pass::InterproceduralSROA => "0 or 1", Pass::Xdot => "0 or 1", _ => "0", } diff --git a/juno_scheduler/src/pm.rs b/juno_scheduler/src/pm.rs index e049f985e0db36ae78368b8d33c01d22744fdcc6..70d8e4278169ebdbe9985e00ede161acbe05c24d 100644 --- a/juno_scheduler/src/pm.rs +++ b/juno_scheduler/src/pm.rs @@ -1837,7 +1837,17 @@ fn run_pass( pm.clear_analyses(); } Pass::ConstInline => { - assert!(args.is_empty()); + let inline_collections = match args.get(0) { + Some(Value::Boolean { val }) => *val, + Some(_) => { + return Err(SchedulerError::PassError { + pass: "constInline".to_string(), + error: "expected boolean argument".to_string(), + }); + } + None => true, + }; + pm.make_callgraph(); let callgraph = pm.callgraph.take().unwrap(); @@ -1845,7 +1855,7 @@ fn run_pass( .into_iter() .map(|editor| editor.unwrap()) .collect(); - const_inline(&mut editors, &callgraph); + const_inline(&mut editors, &callgraph, inline_collections); for func in editors { changed |= func.modified();