diff --git a/README_ASIC.md b/README_ASIC.md
index 7a925f06..2baea37e 100644
--- a/README_ASIC.md
+++ b/README_ASIC.md
@@ -58,10 +58,11 @@ List of options [default, min - max]:
         Allows connecting to a timelord running on a remote host. Useful when running multiple machines with VDF hardware connecting to a single timelord.
   --vdfs-mask N - mask for enabling VDF engines [7, 1 - 7]
         The ASIC has 3 VDF engines numbered 0, 1, 2. If not running all 3 engines, the mask can be specified to enable specific engines. It must be the result of bitwise OR of the engine bits (1, 2, 4 for engines 0, 1, 2).
-  --vdf-threads N - number of software threads per VDF engine [4, 2 - 64]
+  --vdf-threads N - max number of software threads per VDF engine [4, 2 - 64]
         Number of software threads computing intermediate values and proofs per VDF engine.
-  --proof-threads N - number of proof threads per VDF engine
+  --proof-threads N - max number of proof threads per VDF engine
         Number of software threads only computing proofs per VDF engine. Must be less than --vdf-threads.
+  --segment-threads N - number of proof threads per segment [2, 1 - 8]
   --auto-freq-period N - auto-adjust frequency every N seconds [0, 10 - inf]
   --list - list available devices and exit
 ```
diff --git a/src/hw/hw_proof.cpp b/src/hw/hw_proof.cpp
index 3ea63ee6..23caaa4e 100644
--- a/src/hw/hw_proof.cpp
+++ b/src/hw/hw_proof.cpp
@@ -6,7 +6,7 @@
 #include <cstdlib>
 #include <unistd.h>
 
-static const uint32_t g_chkp_thres = 1000000;
+static const uint32_t g_chkp_thres = 500000;
 static const uint32_t g_skip_thres = 10;
 
 void report_bad_vdf_value(struct vdf_state *vdf, struct vdf_value *val)
@@ -297,13 +297,47 @@ bool hw_proof_should_queue(struct vdf_state *vdf, uint64_t iters)
     return iters < vdf->proofs[last_queued_idx].iters;
 }
 
+// Queue the proofs covering `iters` iterations: up to two checkpoint proofs
+// (flags 0) followed by the final proof (HW_VDF_PROOF_FLAG_IS_REQ). `prev` is
+// handed to hw_queue_proof() and replaced by its return value after each call.
+void hw_proof_queue_proofs(struct vdf_state *vdf, uint64_t iters, uint16_t prev)
+{
+    uint32_t s = vdf->segment_threads;
+    uint32_t chkp_thres = g_chkp_thres * s; // split threshold scales with threads per segment
+    double f = 1.0 / s;
+    // Share of the total iterations given to the first proof segment when
+    // splitting in two; works out to 0.75 for s = 2
+    double x = (f + 1) / (2 * f + 1);
+
+    if (iters > chkp_thres) {
+        uint64_t chkp_iters = iters * x; // at least 2 segments: length of the first one
+        if (iters - chkp_iters > chkp_thres) { // remainder still too long: use 3 segments
+            uint64_t chkp2_iters;
+            double p = f + 1.0;
+            double k = p / f;
+            // Share of the total iterations for the second segment (of 3)
+            double y = p / (k * p + 2.0 * f + 1.0);
+            x = y * k; // share of the first segment (of 3); x, y are about 0.69, 0.23 for s = 2
+
+            chkp_iters = iters * x;
+            chkp_iters = chkp_iters / vdf->interval * vdf->interval; // round down to a multiple of interval
+            prev = hw_queue_proof(vdf, chkp_iters, prev, 0);
+            chkp2_iters = iters * y; // taken from the full total, before iters is reduced
+            iters -= chkp_iters;
+            chkp_iters = chkp2_iters;
+        }
+        chkp_iters = chkp_iters / vdf->interval * vdf->interval; // round down to a multiple of interval
+        prev = hw_queue_proof(vdf, chkp_iters, prev, 0);
+        iters -= chkp_iters; // what is left belongs to the final proof
+    }
+    hw_queue_proof(vdf, iters, prev, HW_VDF_PROOF_FLAG_IS_REQ);
+}
+
 void hw_proof_process_req(struct vdf_state *vdf)
 {
     uint64_t iters;
     uint64_t req_iters;
     uint64_t base_iters = 0;
-    uint64_t chkp_iters;
-    uint32_t chkp_div = 4, chkp_mul = 3;
     uint8_t max_chkp_segments = 64 - 3;
     int i;
     uint16_t prev = HW_VDF_PROOF_NONE;
@@ -348,27 +382,7 @@ void hw_proof_process_req(struct vdf_state *vdf)
 
     iters = req_iters - base_iters;
 
-    if (iters > g_chkp_thres) {
-        // Split iters as [75%, 25%]
-        chkp_iters = iters * chkp_mul / chkp_div;
-        if (iters - chkp_iters > g_chkp_thres) {
-            // Split iters as [69%, 23%, 8%]
-            uint32_t chkp2_mul[] = { 69, 69 + 23 };
-            uint64_t chkp2_iters;
-
-            chkp_iters = iters * chkp2_mul[0] / 100;
-            chkp_iters = chkp_iters / vdf->interval * vdf->interval;
-            prev = hw_queue_proof(vdf, chkp_iters, prev, 0);
-
-            chkp2_iters = iters * chkp2_mul[1] / 100 - chkp_iters;
-            iters -= chkp_iters;
-            chkp_iters = chkp2_iters;
-        }
-        chkp_iters = chkp_iters / vdf->interval * vdf->interval;
-        prev = hw_queue_proof(vdf, chkp_iters, prev, 0);
-        iters -= chkp_iters;
-    }
-    hw_queue_proof(vdf, iters, prev, HW_VDF_PROOF_FLAG_IS_REQ);
+    hw_proof_queue_proofs(vdf, iters, prev);
 
     {
         ProofCmp cmp(vdf->proofs);
@@ -431,7 +445,7 @@ void hw_proof_process_work(struct vdf_state *vdf)
                     vdf->idx, i, iters, proof->seg_iters, is_chkp ? " [checkpoint]" : "");
             vdf->queued_proofs.erase(vdf->queued_proofs.begin());
             vdf->aux_threads_busy |= 1UL << i;
-            vdf->n_proof_threads += PARALLEL_PROVER_N_THREADS;
+            vdf->n_proof_threads += vdf->segment_threads;
             proof->flags |= HW_VDF_PROOF_FLAG_STARTED;
             std::thread(hw_compute_proof, vdf, idx, proof, i).detach();
         }
@@ -569,7 +583,7 @@ void hw_stop_proof(struct vdf_state *vdf)
 class HwProver : public ParallelProver {
   public:
     HwProver(Segment segm, integer D, struct vdf_state *vdf)
-        : ParallelProver(segm, D)
+        : ParallelProver(segm, D, vdf->segment_threads)
     {
         this->vdf = vdf;
         k = FindK(segm.length);
@@ -677,7 +691,7 @@ void hw_compute_proof(struct vdf_state *vdf, size_t proof_idx, struct vdf_proof
         Segment seg(start_iters, proof_iters - start_iters, x, y);
         HwProver prover(seg, vdf->D, vdf);
 
-        if (!is_chkp && seg.length > g_chkp_thres) {
+        if (!is_chkp && seg.length > g_chkp_thres * vdf->segment_threads) {
             LOG_INFO("VDF %d: Warning: too long final proof segment length=%lu",
                     vdf->idx, seg.length);
         }
@@ -727,7 +741,7 @@ void hw_compute_proof(struct vdf_state *vdf, size_t proof_idx, struct vdf_proof
 out:
     if (thr_idx < vdf->max_aux_threads) {
         vdf->aux_threads_busy &= ~(1UL << thr_idx);
-        vdf->n_proof_threads -= PARALLEL_PROVER_N_THREADS;
+        vdf->n_proof_threads -= vdf->segment_threads;
     }
 }
 
@@ -806,6 +820,10 @@ void init_vdf_state(struct vdf_state *vdf, struct vdf_proof_opts *opts, const ch
     if (opts && opts->max_proof_threads) {
         vdf->max_proof_threads = opts->max_proof_threads;
     }
+    vdf->segment_threads = 2;
+    if (opts && opts->segment_threads) {
+        vdf->segment_threads = opts->segment_threads;
+    }
 
     mpz_set_str(vdf->D.impl, d_str, 0);
     mpz_set(vdf->L.impl, vdf->D.impl);
diff --git a/src/hw/hw_proof.hpp b/src/hw/hw_proof.hpp
index 36684131..b70a5151 100644
--- a/src/hw/hw_proof.hpp
+++ b/src/hw/hw_proof.hpp
@@ -51,6 +51,7 @@ struct vdf_proof {
 struct vdf_proof_opts {
     uint8_t max_aux_threads;
     uint8_t max_proof_threads;
+    uint8_t segment_threads; // proof threads per segment; 0 makes init_vdf_state() keep its default of 2
 };
 
 struct vdf_state {
@@ -85,6 +86,7 @@ struct vdf_state {
     uint8_t idx;
     uint8_t max_aux_threads;
     uint8_t max_proof_threads;
+    uint8_t segment_threads;
     bool completed;
     bool stopping;
     bool init_done;
diff --git a/src/hw/hw_vdf_client.cpp b/src/hw/hw_vdf_client.cpp
index c27ac51a..2a00d6df 100644
--- a/src/hw/hw_vdf_client.cpp
+++ b/src/hw/hw_vdf_client.cpp
@@ -419,6 +419,7 @@ int parse_opts(int argc, char **argv, struct vdf_client_opts *opts)
         {"vdfs-mask", required_argument, NULL, 1},
         {"vdf-threads", required_argument, NULL, 1},
         {"proof-threads", required_argument, NULL, 1},
+        {"segment-threads", required_argument, NULL, 1},
         {"list", no_argument, NULL, 1},
         {"auto-freq-period", required_argument, NULL, 1},
         {0}
@@ -435,6 +436,7 @@ int parse_opts(int argc, char **argv, struct vdf_client_opts *opts)
     opts->auto_freq = false;
     opts->vpo.max_aux_threads = HW_VDF_DEFAULT_MAX_AUX_THREADS;
     opts->vpo.max_proof_threads = 0;
+    opts->vpo.segment_threads = 0;
     opts->vdfs_mask = 0;
 
     while ((ret = getopt_long(argc, argv, "", long_opts, &long_idx)) == 1) {
@@ -451,8 +453,10 @@ int parse_opts(int argc, char **argv, struct vdf_client_opts *opts)
         } else if (long_idx == 5) {
             opts->vpo.max_proof_threads = strtoul(optarg, NULL, 0);
         } else if (long_idx == 6) {
-            opts->do_list = true;
+            opts->vpo.segment_threads = strtoul(optarg, NULL, 0);
         } else if (long_idx == 7) {
+            opts->do_list = true;
+        } else if (long_idx == 8) {
             opts->auto_freq = true;
             opts->auto_freq_period = strtoul(optarg, NULL, 0);
         }
@@ -493,6 +497,10 @@ int parse_opts(int argc, char **argv, struct vdf_client_opts *opts)
         LOG_SIMPLE("Number of proof threads must be less than VDF threads");
         return -1;
     }
+    if (opts->vpo.segment_threads > 8) {
+        LOG_SIMPLE("Number of proof threads per segment must be between 1 and 8");
+        return -1;
+    }
     if (opts->auto_freq && opts->auto_freq_period < 10) {
         LOG_SIMPLE("Invalid auto freq period");
         return -1;
@@ -525,8 +533,9 @@ int hw_vdf_client_main(int argc, char **argv)
                 "  --voltage N - set board voltage [%.2f, 0.7 - 1.0]\n"
                 "  --ip A.B.C.D - timelord IP address [localhost]\n"
                 "  --vdfs-mask N - mask for enabling VDF engines [7, 1 - 7]\n"
-                "  --vdf-threads N - number of software threads per VDF engine [4, 2 - 64]\n"
-                "  --proof-threads N - number of proof threads per VDF engine\n"
+                "  --vdf-threads N - max number of software threads per VDF engine [4, 2 - 64]\n"
+                "  --proof-threads N - max number of proof threads per VDF engine\n"
+                "  --segment-threads N - number of proof threads per segment [2, 1 - 8]\n"
                 "  --auto-freq-period N - auto-adjust frequency every N seconds [0, 10 - inf]\n"
                 "  --list - list available devices and exit",
                 argv[0], (int)HW_VDF_DEF_FREQ, HW_VDF_DEF_VOLTAGE);
diff --git a/src/prover_parallel.hpp b/src/prover_parallel.hpp
index 6967477e..18551db0 100644
--- a/src/prover_parallel.hpp
+++ b/src/prover_parallel.hpp
@@ -6,7 +6,7 @@
 #include "proof_common.h"
 #include "util.h"
 
-#define PARALLEL_PROVER_N_THREADS 2
+#define PROVER_MAX_SEGMENT_THREADS 8
 
 class ParallelProver : public Prover {
   private:
@@ -76,33 +76,47 @@ class ParallelProver : public Prover {
         x_vals[thr_idx] = x;
     }
   public:
-    ParallelProver(Segment segm, integer D) : Prover(segm, D) {}
+    ParallelProver(Segment segm, integer D, size_t n_thr) : Prover(segm, D) {
+        this->n_threads = n_thr;
+    }
     void GenerateProof();
 
   protected:
     integer B;
     integer L;
     form id;
-    form x_vals[PARALLEL_PROVER_N_THREADS];
+    form x_vals[PROVER_MAX_SEGMENT_THREADS];
+    size_t n_threads;
 };
 
 void ParallelProver::GenerateProof() {
     PulmarkReducer reducer;
+    uint32_t len = l / n_threads;
+    uint32_t rem = l % n_threads;
+    uint32_t start = l;
+    std::thread threads[PROVER_MAX_SEGMENT_THREADS];
 
     this->B = GetB(D, segm.x, segm.y);
     this->L = root(-D, 4);
     this->id = form::identity(D);
 
-    uint32_t l0 = l / 2;
-    uint32_t l1 = l - l0;
-    std::thread proof_thr(ParallelProver::ProofThread, this, 0, l, l0);
-    ProvePart(1, l1, l1);
+    for (size_t i = 0; i < n_threads; i++) {
+        uint32_t cur_len = len + (i < rem);
+        threads[i] = std::thread(ParallelProver::ProofThread, this, i, start, cur_len);
+        start -= cur_len;
+    }
 
-    proof_thr.join();
+    for (size_t i = 0; i < n_threads; i++) {
+        threads[i].join();
+    }
     if (!PerformExtraStep()) {
         return;
     }
-    nucomp_form(proof, x_vals[0], x_vals[1], D, L);
+
+    proof = x_vals[0];
+    for (size_t i = 1; i < n_threads; i++) {
+        nucomp_form(proof, proof, x_vals[i], D, L);
+    }
     reducer.reduce(proof);
     OnFinish();
 }
diff --git a/src/vdf_base.hpp b/src/vdf_base.hpp
index dba3c29b..61cc9328 100644
--- a/src/vdf_base.hpp
+++ b/src/vdf_base.hpp
@@ -295,17 +295,20 @@ class Prover {
     std::atomic<bool> is_finished;
 };
 
-#define PARALLEL_PROVER_N_THREADS 2
+#define PROVER_MAX_SEGMENT_THREADS 8
 
 class ParallelProver : public Prover {
   public:
-    ParallelProver(Segment segm, integer D) : Prover(segm, D) {}
+    // n_thr: number of worker threads; x_vals[] holds PROVER_MAX_SEGMENT_THREADS slots, so keep it within 1..MAX
+    ParallelProver(Segment segm, integer D, size_t n_thr)
+        : Prover(segm, D), n_threads(n_thr) {}
     void GenerateProof();
   protected:
     integer B;
     integer L;
     form id;
-    form x_vals[PARALLEL_PROVER_N_THREADS];
+    form x_vals[PROVER_MAX_SEGMENT_THREADS]; // one result slot per worker thread
+    size_t n_threads; // worker thread count passed to the constructor
 };
 
 void nudupl_form(form &a, form &b, integer &D, integer &L);