This repository has been archived by the owner on Aug 15, 2024. It is now read-only.

Commit

Merge pull request #10 from matter-labs/optimizations
Optimization using prefetch, available on nightly Rust via the `nightly` feature
shamatar authored Jul 13, 2019
2 parents 86e3f9c + d23b855 commit 1aff83c
Showing 41 changed files with 5,832 additions and 1,191 deletions.
11 changes: 7 additions & 4 deletions Cargo.toml
@@ -6,7 +6,7 @@ homepage = "https://github.com/matter-labs/bellman"
license = "MIT/Apache-2.0"
name = "bellman_ce"
repository = "https://github.com/matter-labs/bellman"
version = "0.3.0"
version = "0.3.1"
edition = "2018"

[lib]
@@ -18,24 +18,27 @@ bit-vec = "0.4.4"
futures = "0.1"
cfg-if = "0.1.7"

#pairing = { git = 'https://github.com/matterinc/pairing', tag = "0.16.2" }
pairing_ce = { version = "0.17.0" }
#pairing = {package = "pairing_ce", path = "../pairing" }
pairing = {package = "pairing_ce", version = "0.18.0" }
byteorder = "1"

futures-cpupool = {version = "0.1", optional = true}
num_cpus = {version = "1", optional = true}
crossbeam = {version = "0.7.1", optional = true}

prefetch = {version = "0.2", optional = true}

web-sys = {version = "0.3.17", optional = true, features = ["console", "Performance", "Window"]}

tiny-keccak = {version = "1.4.2", optional = true}
blake2-rfc = {version = "0.2.18", optional = true}

[features]
default = ["multicore"]
#default = ["multicore", "gm17", "sonic"]
#default = ["multicore", "nightly"]
#default = ["wasm"]
multicore = ["futures-cpupool", "num_cpus", "crossbeam"]
sonic = ["tiny-keccak", "blake2-rfc"]
gm17 = []
wasm = ["web-sys"]
nightly = ["prefetch"]
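
The new `nightly` feature pulls the optional `prefetch` crate into the build. Below is a minimal sketch of how such a feature gate is typically wired up; the helper name `prefetch_slice_hint` and its body are assumptions for illustration, not code from this commit.

```rust
// Illustrative only: a nightly-gated prefetch hint with a stable no-op fallback.
#[cfg(feature = "nightly")]
fn prefetch_slice_hint<T>(data: &[T], index: usize) {
    // On nightly builds, a software prefetch (e.g. via the `prefetch` crate or
    // an intrinsic) would be issued here for `data[index]` so it is already in
    // cache when the hot loop reaches it.
    if index < data.len() {
        let _ptr = &data[index] as *const T;
        // prefetch call would go here
    }
}

#[cfg(not(feature = "nightly"))]
fn prefetch_slice_hint<T>(_data: &[T], _index: usize) {
    // No-op on stable Rust: the surrounding code path is identical,
    // just without the cache hint.
}

fn main() {
    let bases = vec![1u64, 2, 3, 4];
    for i in 0..bases.len() {
        // Hint the next element while processing the current one.
        prefetch_slice_hint(&bases, i + 1);
        let _ = bases[i];
    }
}
```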
32 changes: 32 additions & 0 deletions README.md
@@ -2,6 +2,38 @@

Originally developed for Zcash, with extensions from us to make it a little more pleasant. Uses our "community edition" pairing for Ethereum's BN256 curve. Now published as `bellman_ce` on `crates.io` for ease of use.

## Features

There are two features that are ready for production use; they are stable and their API will not change. Both cover the Groth16 proof system implementation.

- `multicore` feature (enabled by default) is intended for PCs and other environments with full `std` support, including threading.
- `singlecore` feature is mainly intended for WASM targets, where incompatible external crates and all multithreading are removed.

These features were implemented, in response to requests for a maintainable repo with WASM compatibility, while the GM17 and SONIC proof systems were being worked on. As a result there are two more features that are incomplete and will see breaking changes in the future; they are aimed at interested enthusiasts.

- `gm17` - incomplete; it will most likely get attention after SONIC is brought to completion.
- `sonic` - about 90% complete. The original implementation of the `helped` protocol is integrated with an API similar to Groth16, along with wrapping adapters that let existing circuits be used without changes. The `unhelped` version is not yet complete, but all cryptographic primitives are implemented and tested. It is currently the priority.

## Future progress

The intention is to add the `GM17` and `SONIC` proof systems.

## Features

There are two features that are ready for production use; they are stable and their API will not change. Both cover the Groth16 proof system implementation.

- `multicore` feature (enabled by default) is intended for PCs and other environments with full `std` support, including threading.
- `singlecore` feature is mainly intended for WASM targets, where incompatible external crates and all multithreading are removed.

These features were implemented, in response to requests for a maintainable repo with WASM compatibility, while the GM17 and SONIC proof systems were being worked on. As a result there are two more features that are incomplete and will see breaking changes in the future; they are aimed at interested enthusiasts.

- `gm17` - incomplete; it will most likely get attention after SONIC is brought to completion.
- `sonic` - about 90% complete. The original implementation of the `helped` protocol is integrated with an API similar to Groth16, along with wrapping adapters that let existing circuits be used without changes. The `unhelped` version is not yet complete, but all cryptographic primitives are implemented and tested. It is currently the priority.

## Future progress

The intention is to add the `GM17` and `SONIC` proof systems.

## License

Licensed under either of
4 changes: 2 additions & 2 deletions src/cs.rs
@@ -20,7 +20,7 @@ pub trait Circuit<E: Engine> {
}

/// Represents a variable in our constraint system.
#[derive(Copy, Clone, Debug)]
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
pub struct Variable(pub(crate) Index);

impl Variable {
@@ -39,7 +39,7 @@ impl Variable {

/// Represents the index of either an input variable or
/// auxiliary variable.
#[derive(Copy, Clone, PartialEq, Debug)]
#[derive(Copy, Clone, PartialEq, Debug, Hash, Eq)]
pub enum Index {
Input(usize),
Aux(usize)
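
Adding `Hash`, `PartialEq`, and `Eq` to `Variable` and `Index` lets them be used as keys in hash-based collections. A standalone sketch of that pattern follows; the types here are simplified stand-ins that mirror the definitions above, not the crate's own code.

```rust
use std::collections::HashMap;

// Simplified stand-ins for the crate's Index/Variable types.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
enum Index {
    Input(usize),
    Aux(usize),
}

#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
struct Variable(Index);

fn main() {
    // With Hash + Eq derived, variables can index a map, e.g. to cache
    // per-variable data during circuit synthesis.
    let mut assignments: HashMap<Variable, u64> = HashMap::new();
    assignments.insert(Variable(Index::Input(0)), 1);
    assignments.insert(Variable(Index::Aux(3)), 42);
    assert_eq!(assignments.get(&Variable(Index::Aux(3))), Some(&42));
}
```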
213 changes: 61 additions & 152 deletions src/groth16/prover.rs
@@ -86,6 +86,48 @@ fn eval<E: Engine>(
acc
}

pub(crate) fn field_elements_into_representations<E: Engine>(
    worker: &Worker,
    scalars: Vec<E::Fr>
) -> Result<Vec<<E::Fr as PrimeField>::Repr>, SynthesisError>
{
    let mut representations = vec![<E::Fr as PrimeField>::Repr::default(); scalars.len()];
    worker.scope(scalars.len(), |scope, chunk| {
        for (scalar, repr) in scalars.chunks(chunk)
            .zip(representations.chunks_mut(chunk)) {
            scope.spawn(move |_| {
                for (scalar, repr) in scalar.iter()
                    .zip(repr.iter_mut()) {
                    *repr = scalar.into_repr();
                }
            });
        }
    });

    Ok(representations)
}

pub(crate) fn scalars_into_representations<E: Engine>(
    worker: &Worker,
    scalars: Vec<Scalar<E>>
) -> Result<Vec<<E::Fr as PrimeField>::Repr>, SynthesisError>
{
    let mut representations = vec![<E::Fr as PrimeField>::Repr::default(); scalars.len()];
    worker.scope(scalars.len(), |scope, chunk| {
        for (scalar, repr) in scalars.chunks(chunk)
            .zip(representations.chunks_mut(chunk)) {
            scope.spawn(move |_| {
                for (scalar, repr) in scalar.iter()
                    .zip(repr.iter_mut()) {
                    *repr = scalar.0.into_repr();
                }
            });
        }
    });

    Ok(representations)
}
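
Both helpers above follow the same pattern: split the input into worker-sized chunks, hand each chunk to its own thread, and write converted values into a pre-allocated output vector. Here is a self-contained sketch of that chunked-conversion pattern using only the standard library (`std::thread::scope`), since the crate's `Worker` type is specific to this repo.

```rust
use std::thread;

// Convert a vector in parallel by zipping input chunks with output chunks,
// the same shape as field_elements_into_representations above.
fn parallel_convert(values: Vec<u64>) -> Vec<String> {
    let mut out = vec![String::new(); values.len()];
    let chunk = (values.len() / 4).max(1); // e.g. split work across ~4 threads
    thread::scope(|scope| {
        for (src, dst) in values.chunks(chunk).zip(out.chunks_mut(chunk)) {
            scope.spawn(move || {
                for (v, slot) in src.iter().zip(dst.iter_mut()) {
                    *slot = v.to_string();
                }
            });
        }
    });
    out
}

fn main() {
    let reprs = parallel_convert((0..10).collect());
    assert_eq!(reprs[7], "7");
}
```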

// This is a proving assignment with densities precalculated
pub struct PreparedProver<E: Engine>{
assignment: ProvingAssignment<E>,
@@ -145,7 +187,7 @@ pub fn prepare_prover<E, C>(

impl<E:Engine> PreparedProver<E> {
pub fn create_random_proof<R, P: ParameterSource<E>>(
& self,
self,
params: P,
rng: &mut R
) -> Result<Proof<E>, SynthesisError>
@@ -158,16 +200,16 @@ impl<E:Engine> PreparedProver<E> {
}

pub fn create_proof<P: ParameterSource<E>>(
& self,
self,
mut params: P,
r: E::Fr,
s: E::Fr
) -> Result<Proof<E>, SynthesisError>
{
let prover = self.assignment.clone();
let prover = self.assignment;
let worker = Worker::new();

let vk = params.get_vk(self.assignment.input_assignment.len())?;
let vk = params.get_vk(prover.input_assignment.len())?;

let stopwatch = Stopwatch::new();

@@ -202,7 +244,8 @@ impl<E:Engine> PreparedProver<E> {
a.truncate(a_len);
// TODO: parallelize if it's even helpful
// TODO: in large settings it may worth to parallelize
let a = Arc::new(a.into_iter().map(|s| s.0.into_repr()).collect::<Vec<_>>());
let a = Arc::new(scalars_into_representations::<E>(&worker, a)?);
// let a = Arc::new(a.into_iter().map(|s| s.0.into_repr()).collect::<Vec<_>>());

multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
};
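
For context, the block above computes the Groth16 "H query": the coefficients of the quotient polynomial, evaluated on a coset where the vanishing polynomial does not vanish (as the comments in the removed code further down spell out). In the code's notation, with A, B, C the polynomials over the evaluation domain and Z the vanishing polynomial, roughly:

$$
H(x) = \frac{A(x)\,B(x) - C(x)}{Z(x)}
$$

It is computed by interpolating A, B, C with inverse FFTs, multiplying and subtracting point-wise on the coset, dividing by Z there, and interpolating back; the resulting coefficients then feed the `params.get_h` multiexp.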
@@ -213,13 +256,19 @@

// TODO: Check that difference in operations for different chunks is small

let input_len = prover.input_assignment.len();
let aux_len = prover.aux_assignment.len();

let input_assignment = Arc::new(field_elements_into_representations::<E>(&worker, prover.input_assignment)?);
let aux_assignment = Arc::new(field_elements_into_representations::<E>(&worker, prover.aux_assignment)?);

// TODO: parallelize if it's even helpful
// TODO: in large settings it may worth to parallelize
let input_assignment = Arc::new(prover.input_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());
let aux_assignment = Arc::new(prover.aux_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());
// let input_assignment = Arc::new(prover.input_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());
// let aux_assignment = Arc::new(prover.aux_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());

let input_len = input_assignment.len();
let aux_len = aux_assignment.len();
// let input_len = input_assignment.len();
// let aux_len = aux_assignment.len();
elog_verbose!("H query is dense in G1,\nOther queries are {} elements in G1 and {} elements in G2",
2*(input_len + aux_len) + aux_len, input_len + aux_len);

@@ -402,153 +451,13 @@ pub fn create_random_proof<E, C, R, P: ParameterSource<E>>(

pub fn create_proof<E, C, P: ParameterSource<E>>(
circuit: C,
mut params: P,
params: P,
r: E::Fr,
s: E::Fr
) -> Result<Proof<E>, SynthesisError>
where E: Engine, C: Circuit<E>
{
let mut prover = ProvingAssignment {
a_aux_density: DensityTracker::new(),
b_input_density: DensityTracker::new(),
b_aux_density: DensityTracker::new(),
a: vec![],
b: vec![],
c: vec![],
input_assignment: vec![],
aux_assignment: vec![]
};

prover.alloc_input(|| "", || Ok(E::Fr::one()))?;
let prover = prepare_prover(circuit)?;

circuit.synthesize(&mut prover)?;

for i in 0..prover.input_assignment.len() {
prover.enforce(|| "",
|lc| lc + Variable(Index::Input(i)),
|lc| lc,
|lc| lc,
);
}

let worker = Worker::new();

let vk = params.get_vk(prover.input_assignment.len())?;

let stopwatch = Stopwatch::new();

let h = {
let mut a = EvaluationDomain::from_coeffs(prover.a)?;
let mut b = EvaluationDomain::from_coeffs(prover.b)?;
let mut c = EvaluationDomain::from_coeffs(prover.c)?;
elog_verbose!("H query domain size is {}", a.as_ref().len());
// here a coset is a domain where denominator (z) does not vanish
// inverse FFT is an interpolation
a.ifft(&worker);
// evaluate in coset
a.coset_fft(&worker);
// same is for B and C
b.ifft(&worker);
b.coset_fft(&worker);
c.ifft(&worker);
c.coset_fft(&worker);

// do A*B-C in coset
a.mul_assign(&worker, &b);
drop(b);
a.sub_assign(&worker, &c);
drop(c);
// z does not vanish in coset, so we divide by non-zero
a.divide_by_z_on_coset(&worker);
// interpolate back in coset
a.icoset_fft(&worker);
let mut a = a.into_coeffs();
let a_len = a.len() - 1;
a.truncate(a_len);
// TODO: parallelize if it's even helpful
// TODO: in large settings it may worth to parallelize
let a = Arc::new(a.into_iter().map(|s| s.0.into_repr()).collect::<Vec<_>>());

multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
};

elog_verbose!("{} seconds for prover for H evaluation (mostly FFT)", stopwatch.elapsed());

let stopwatch = Stopwatch::new();

// TODO: Check that difference in operations for different chunks is small

// TODO: parallelize if it's even helpful
// TODO: in large settings it may worth to parallelize
let input_assignment = Arc::new(prover.input_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());
let aux_assignment = Arc::new(prover.aux_assignment.into_iter().map(|s| s.into_repr()).collect::<Vec<_>>());

// Run a dedicated process for dense vector
let l = multiexp(&worker, params.get_l(aux_assignment.len())?, FullDensity, aux_assignment.clone());

let a_aux_density_total = prover.a_aux_density.get_total_density();

let (a_inputs_source, a_aux_source) = params.get_a(input_assignment.len(), a_aux_density_total)?;

let a_inputs = multiexp(&worker, a_inputs_source, FullDensity, input_assignment.clone());
let a_aux = multiexp(&worker, a_aux_source, Arc::new(prover.a_aux_density), aux_assignment.clone());

let b_input_density = Arc::new(prover.b_input_density);
let b_input_density_total = b_input_density.get_total_density();
let b_aux_density = Arc::new(prover.b_aux_density);
let b_aux_density_total = b_aux_density.get_total_density();

let (b_g1_inputs_source, b_g1_aux_source) = params.get_b_g1(b_input_density_total, b_aux_density_total)?;

let b_g1_inputs = multiexp(&worker, b_g1_inputs_source, b_input_density.clone(), input_assignment.clone());
let b_g1_aux = multiexp(&worker, b_g1_aux_source, b_aux_density.clone(), aux_assignment.clone());

let (b_g2_inputs_source, b_g2_aux_source) = params.get_b_g2(b_input_density_total, b_aux_density_total)?;

let b_g2_inputs = multiexp(&worker, b_g2_inputs_source, b_input_density, input_assignment);
let b_g2_aux = multiexp(&worker, b_g2_aux_source, b_aux_density, aux_assignment);

if vk.delta_g1.is_zero() || vk.delta_g2.is_zero() {
// If this element is zero, someone is trying to perform a
// subversion-CRS attack.
return Err(SynthesisError::UnexpectedIdentity);
}

let mut g_a = vk.delta_g1.mul(r);
g_a.add_assign_mixed(&vk.alpha_g1);
let mut g_b = vk.delta_g2.mul(s);
g_b.add_assign_mixed(&vk.beta_g2);
let mut g_c;
{
let mut rs = r;
rs.mul_assign(&s);

g_c = vk.delta_g1.mul(rs);
g_c.add_assign(&vk.alpha_g1.mul(s));
g_c.add_assign(&vk.beta_g1.mul(r));
}
let mut a_answer = a_inputs.wait()?;
a_answer.add_assign(&a_aux.wait()?);
g_a.add_assign(&a_answer);
a_answer.mul_assign(s);
g_c.add_assign(&a_answer);

let mut b1_answer = b_g1_inputs.wait()?;
b1_answer.add_assign(&b_g1_aux.wait()?);
let mut b2_answer = b_g2_inputs.wait()?;
b2_answer.add_assign(&b_g2_aux.wait()?);

g_b.add_assign(&b2_answer);
b1_answer.mul_assign(r);
g_c.add_assign(&b1_answer);
g_c.add_assign(&h.wait()?);
g_c.add_assign(&l.wait()?);

elog_verbose!("{} seconds for prover for point multiplication", stopwatch.elapsed());

Ok(Proof {
a: g_a.into_affine(),
b: g_b.into_affine(),
c: g_c.into_affine()
})
prover.create_proof(params, r, s)
}
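
Earlier in this file's diff, `PreparedProver::create_proof` changes from taking `&self` to taking `self`, so the prover can move its assignment out (`let prover = self.assignment;`) instead of cloning it, and the free `create_proof` function above now simply builds a prover via `prepare_prover(circuit)?` and delegates to it. A minimal standalone illustration of that by-value pattern follows; the types here are hypothetical, not the actual prover.

```rust
#[derive(Clone)]
struct Assignment {
    values: Vec<u64>,
}

struct Prepared {
    assignment: Assignment,
}

impl Prepared {
    // Taking `self` by value lets us move `assignment` out without cloning,
    // mirroring the &self -> self change in PreparedProver::create_proof.
    fn finish(self) -> Vec<u64> {
        let assignment = self.assignment; // move, no .clone() needed
        assignment.values
    }
}

fn prepare(values: Vec<u64>) -> Prepared {
    Prepared { assignment: Assignment { values } }
}

// The thin wrapper simply delegates, like the refactored create_proof.
fn run(values: Vec<u64>) -> Vec<u64> {
    prepare(values).finish()
}

fn main() {
    assert_eq!(run(vec![1, 2, 3]), vec![1, 2, 3]);
}
```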
6 changes: 1 addition & 5 deletions src/lib.rs
@@ -3,7 +3,7 @@
#[macro_use]

extern crate cfg_if;
extern crate pairing_ce as pairing_import;
pub extern crate pairing;
extern crate rand;
extern crate bit_vec;
extern crate byteorder;
@@ -43,10 +43,6 @@ cfg_if! {
}
}

pub mod pairing {
pub use pairing_import::*;
}

mod cs;
pub use self::cs::*;
