diff --git a/images/mathoverflow_scalar.jpg b/images/mathoverflow_scalar.jpg
new file mode 100644
index 00000000..621a519a
Binary files /dev/null and b/images/mathoverflow_scalar.jpg differ
diff --git a/images/mathoverflow_scalar.lep b/images/mathoverflow_scalar.lep
new file mode 100644
index 00000000..cbcf24b9
Binary files /dev/null and b/images/mathoverflow_scalar.lep differ
diff --git a/src/enabled_features.rs b/src/enabled_features.rs
index de053f38..ac1828b5 100644
--- a/src/enabled_features.rs
+++ b/src/enabled_features.rs
@@ -1,4 +1,5 @@
 // features that are enabled in the encoder. Turn off for potential backward compat issues.
+#[derive(Debug, Clone, Copy)]
 pub struct EnabledFeatures {
     /// enables/disables reading of progressive images
     pub progressive: bool,
@@ -12,32 +13,52 @@ pub struct EnabledFeatures {
     // maximum jpeg height
     pub max_jpeg_height: i32,
 
-    // Sadly C++ version has a bug where it uses 16 bit math in the SIMD path and 32 bit math in the scalar path
+    /// Sadly the C++ version has a bug where it uses 16 bit math in the SIMD path and 32 bit math in the scalar path
     pub use_16bit_dc_estimate: bool,
+
+    /// Sadly the C++ version has a bug where it uses 16 bit math in the SIMD path and 32 bit math in the scalar path
+    pub use_16bit_adv_predict: bool,
 }
 
-impl Default for EnabledFeatures {
-    fn default() -> Self {
+impl EnabledFeatures {
+    /// parameters that allow everything for encoding that is compatible with the C++ lepton encoder compiled with SIMD
+    #[allow(dead_code)]
+    pub fn compat_lepton_vector_write() -> Self {
         Self {
             progressive: true,
             reject_dqts_with_zeros: true,
-            max_jpeg_width: 16386,
             max_jpeg_height: 16386,
-            use_16bit_dc_estimate: false,
+            max_jpeg_width: 16386,
+            use_16bit_dc_estimate: true,
+            use_16bit_adv_predict: true,
         }
     }
-}
 
-impl EnabledFeatures {
-    /// parameters that allow everything
+    /// parameters that allow everything for decoding C++ lepton images encoded
+    /// with the scalar compile options
     #[allow(dead_code)]
-    pub fn all() -> Self {
+    pub fn compat_lepton_scalar_read() -> Self {
         Self {
             progressive: true,
-            reject_dqts_with_zeros: true,
+            reject_dqts_with_zeros: false,
             max_jpeg_height: i32::MAX,
             max_jpeg_width: i32::MAX,
             use_16bit_dc_estimate: false,
+            use_16bit_adv_predict: false,
+        }
+    }
+
+    /// parameters that allow everything for decoding C++ lepton images encoded
+    /// with the vector (SSE2/AVX2) compile options
+    #[allow(dead_code)]
+    pub fn compat_lepton_vector_read() -> Self {
+        Self {
+            progressive: true,
+            reject_dqts_with_zeros: false,
+            max_jpeg_height: i32::MAX,
+            max_jpeg_width: i32::MAX,
+            use_16bit_dc_estimate: true,
+            use_16bit_adv_predict: true,
         }
     }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 974b9a69..3789a560 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -96,7 +96,7 @@ pub unsafe extern "C" fn WrapperCompressImage(
         &mut reader,
         &mut writer,
         number_of_threads as usize,
-        &EnabledFeatures::default(),
+        &EnabledFeatures::compat_lepton_vector_write(),
     ) {
         Ok(_) => {}
         Err(e) => match e.root_cause().downcast_ref::() {
@@ -159,14 +159,19 @@ pub unsafe extern "C" fn WrapperDecompressImageEx(
 ) -> i32 {
     match catch_unwind(|| {
         // For back-compat with C++ version we allow decompression of images with zeros in DQT tables
-        let mut enabled_features = EnabledFeatures::all();
-        enabled_features.reject_dqts_with_zeros = false;
 
         // C++ version has a bug where it uses 16 bit math in the SIMD path and 32 bit math in the scalar path
         // depending on the compiler options. If use_16bit_dc_estimate=true, the decompression uses a back-compat
         // mode that considers it. The caller should set use_16bit_dc_estimate to true only for images that were
         // compressed by C++ version with relevant compiler options.
-        enabled_features.use_16bit_dc_estimate = use_16bit_dc_estimate;
+
+        // this is a bit of a mess since for a while we were encoding a mix of 16 and 32 bit math
+        // (hence the two parameters in features).
+
+        let mut enabled_features = EnabledFeatures {
+            use_16bit_dc_estimate: use_16bit_dc_estimate,
+            ..EnabledFeatures::compat_lepton_vector_read()
+        };
 
         loop {
             let input = std::slice::from_raw_parts(input_buffer, input_buffer_size as usize);
@@ -179,7 +184,7 @@ pub unsafe extern "C" fn WrapperDecompressImageEx(
                 &mut reader,
                 &mut writer,
                 number_of_threads as usize,
-                &enabled_features,
+                &mut enabled_features,
             ) {
                 Ok(_) => {
                     *result_size = writer.position().into();
diff --git a/src/main.rs b/src/main.rs
index 3a3b81ce..49474fa3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -52,7 +52,7 @@ fn main_with_result() -> anyhow::Result<()> {
     let mut dump = false;
     let mut all = false;
     let mut overwrite = false;
-    let mut enabled_features = EnabledFeatures::default();
+    let mut enabled_features = EnabledFeatures::compat_lepton_vector_read();
 
     // only output the log if we are connected to a console (otherwise if there is redirection we would corrupt the file)
     if stdout().is_terminal() {
@@ -81,6 +81,14 @@ fn main_with_result() -> anyhow::Result<()> {
             enabled_features.reject_dqts_with_zeros = false;
         } else if args[i] == "-use16bitdc" {
             enabled_features.use_16bit_dc_estimate = true;
+        } else if args[i] == "-useleptonscalar" {
+            // lepton files that were encoded by the Dropbox C++ version compiled in scalar mode
+            enabled_features.use_16bit_adv_predict = false;
+            enabled_features.use_16bit_dc_estimate = false;
+        } else if args[i] == "-useleptonvector" {
+            // lepton files that were encoded by the Dropbox C++ version compiled in AVX2/SSE2 mode
+            enabled_features.use_16bit_adv_predict = true;
+            enabled_features.use_16bit_dc_estimate = true;
         } else {
             return err_exit_code(
                 ExitCode::SyntaxError,
@@ -111,7 +119,7 @@ fn main_with_result() -> anyhow::Result<()> {
             .context(here!())?;
     } else {
         lh = LeptonHeader::new();
-        lh.read_lepton_header(&mut reader, &enabled_features)
+        lh.read_lepton_header(&mut reader, &mut enabled_features)
             .context(here!())?;
 
         let _metrics;
@@ -131,7 +139,7 @@ fn main_with_result() -> anyhow::Result<()> {
             println!("{0}", s.replace("},", "},\r\n").replace("],", "],\r\n"));
 
             if !lh
-                .advance_next_header_segment(&EnabledFeatures::default())
+                .advance_next_header_segment(&enabled_features)
                 .context(here!())?
             {
                 break;
diff --git a/src/structs/lepton_decoder.rs b/src/structs/lepton_decoder.rs
index c079cc17..c420413c 100644
--- a/src/structs/lepton_decoder.rs
+++ b/src/structs/lepton_decoder.rs
@@ -360,7 +360,8 @@ fn parse_token(
         eob_y,
     )?;
 
-    let predicted_dc = pt.adv_predict_dc_pix::(&output, qt, context, num_non_zeros);
+    let predicted_dc =
+        pt.adv_predict_dc_pix::(&output, qt, context, num_non_zeros, features);
 
     let coef = model
         .read_dc(
diff --git a/src/structs/lepton_encoder.rs b/src/structs/lepton_encoder.rs
index 674c2d2a..61a336a3 100644
--- a/src/structs/lepton_encoder.rs
+++ b/src/structs/lepton_encoder.rs
@@ -378,7 +378,8 @@ fn serialize_tokens(
     )
     .context(here!())?;
 
-    let predicted_val = pt.adv_predict_dc_pix::(&block, qt, context, &num_non_zeros);
+    let predicted_val =
+        pt.adv_predict_dc_pix::(&block, qt, context, &num_non_zeros, features);
 
     let avg_predicted_dc = ProbabilityTables::adv_predict_or_unpredict_dc(
         block.get_dc(),
diff --git a/src/structs/lepton_format.rs b/src/structs/lepton_format.rs
index e7c96c98..2c2c694a 100644
--- a/src/structs/lepton_format.rs
+++ b/src/structs/lepton_format.rs
@@ -56,11 +56,13 @@ pub fn decode_lepton_wrapper(
 
     let mut lh = LeptonHeader::new();
 
-    lh.read_lepton_header(reader, enabled_features)
+    let mut features_mut = enabled_features.clone();
+
+    lh.read_lepton_header(reader, &mut features_mut)
         .context(here!())?;
 
     let metrics = lh
-        .recode_jpeg(writer, reader, size, num_threads, enabled_features)
+        .recode_jpeg(writer, reader, size, num_threads, &features_mut)
         .context(here!())?;
 
     return Ok(metrics);
@@ -75,7 +77,8 @@ pub fn encode_lepton_wrapper(
 ) -> Result {
     let (lp, image_data) = read_jpeg(reader, enabled_features, max_threads, |_jh| {})?;
 
-    lp.write_lepton_header(writer).context(here!())?;
+    lp.write_lepton_header(writer, enabled_features)
+        .context(here!())?;
 
     let metrics = run_lepton_encoder_threads(
         &lp.jpeg_header,
@@ -125,14 +128,11 @@ pub fn encode_lepton_wrapper_verify(
 
     info!("decompressing to verify contents");
 
+    let mut c = enabled_features.clone();
+
     metrics.merge_from(
-        decode_lepton_wrapper(
-            &mut verifyreader,
-            &mut verify_buffer,
-            max_threads,
-            &enabled_features,
-        )
-        .context(here!())?,
+        decode_lepton_wrapper(&mut verifyreader, &mut verify_buffer, max_threads, &mut c)
+            .context(here!())?,
     );
 
     if input_data.len() != verify_buffer.len() {
@@ -997,7 +997,7 @@ impl LeptonHeader {
     pub fn read_lepton_header(
         &mut self,
         reader: &mut R,
-        enabled_features: &EnabledFeatures,
+        enabled_features: &mut EnabledFeatures,
     ) -> Result<()> {
         let mut header = [0 as u8; LEPTON_FILE_HEADER.len()];
 
@@ -1039,6 +1039,13 @@ impl LeptonHeader {
         if header[5] == 'M' as u8 && header[6] == 'S' as u8 {
             c.set_position(7);
             self.uncompressed_lepton_header_size = c.read_u32::()?;
+
+            // read the flag bits to know how we should decode this file
+            let flags = c.read_u8()?;
+            if (flags & 0x80) != 0 {
+                enabled_features.use_16bit_dc_estimate = (flags & 0x01) != 0;
+                enabled_features.use_16bit_adv_predict = (flags & 0x02) != 0;
+            }
         }
 
         // full size of the original file
@@ -1230,7 +1237,11 @@ impl LeptonHeader {
         return Ok(hdr_data);
     }
 
-    pub fn write_lepton_header(&self, writer: &mut W) -> Result<()> {
+    pub fn write_lepton_header(
+        &self,
+        writer: &mut W,
+        enabled_features: &EnabledFeatures,
+    ) -> Result<()> {
         let mut lepton_header = Vec::::new();
 
         {
@@ -1273,7 +1284,21 @@ impl LeptonHeader {
         writer.write_u8('M' as u8)?;
         writer.write_u8('S' as u8)?;
         writer.write_u32::(lepton_header.len() as u32)?;
-        writer.write_all(&[0; 6])?;
+
+        // write the flags that were used to encode this file
+        writer.write_u8(
+            0x80 | if enabled_features.use_16bit_dc_estimate {
+                1
+            } else {
+                0
+            } | if enabled_features.use_16bit_adv_predict {
+                2
+            } else {
+                0
+            },
+        )?;
+
+        writer.write_all(&[0; 5])?;
 
         writer.write_u32::(self.jpeg_file_size)?;
         writer.write_u32::(compressed_header.len() as u32)?;
@@ -1706,10 +1731,12 @@ fn parse_and_write_header() {
         0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00, 0xd2, 0xcf, 0x20, 0xff, 0xd9, // EOI
     ];
 
+    let mut enabled_features = EnabledFeatures::compat_lepton_vector_read();
+
     let mut lh = LeptonHeader::new();
     lh.jpeg_file_size = 123;
 
-    lh.parse_jpeg_header(&mut Cursor::new(min_jpeg), &EnabledFeatures::all())
+    lh.parse_jpeg_header(&mut Cursor::new(min_jpeg), &enabled_features)
         .unwrap();
     lh.thread_handoff.push(ThreadHandoff {
         luma_y_start: 0,
@@ -1722,12 +1749,12 @@ fn parse_and_write_header() {
     });
 
     let mut serialized = Vec::new();
-    lh.write_lepton_header(&mut Cursor::new(&mut serialized))
+    lh.write_lepton_header(&mut Cursor::new(&mut serialized), &enabled_features)
         .unwrap();
 
     let mut other = LeptonHeader::new();
     let mut other_reader = Cursor::new(&serialized);
     other
-        .read_lepton_header(&mut other_reader, &EnabledFeatures::all())
+        .read_lepton_header(&mut other_reader, &mut enabled_features)
         .unwrap();
 }
diff --git a/src/structs/probability_tables.rs b/src/structs/probability_tables.rs
index 347244a1..beb61be3 100644
--- a/src/structs/probability_tables.rs
+++ b/src/structs/probability_tables.rs
@@ -7,6 +7,7 @@
 use std::cmp;
 
 use crate::consts::*;
+use crate::enabled_features;
 use crate::helpers::*;
 use crate::structs::idct::*;
 use crate::structs::model::*;
@@ -258,6 +259,7 @@ impl ProbabilityTables {
         qt: &QuantizationTables,
         block_context: &BlockContext,
         num_non_zeros: &[NeighborSummary],
+        enabled_features: &enabled_features::EnabledFeatures,
     ) -> PredictDCResult {
         let q_transposed = qt.get_quantization_table_transposed();
 
@@ -268,25 +270,51 @@ impl ProbabilityTables {
         let calc_left = || {
             let left_context = block_context.neighbor_context_left(num_non_zeros);
 
-            let a1 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 0, 8);
-            let a2 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 1, 8);
-            let pixel_delta = a1 - a2;
-            let a: i16x8 = a1 + 1024;
-            let b : i16x8 = i16x8::new(*left_context.get_vertical()) - (pixel_delta - (pixel_delta>>15) >> 1) /* divide pixel_delta by 2 rounding towards 0 */;
+            if enabled_features.use_16bit_adv_predict {
+                let a1 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 0, 8);
+                let a2 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 1, 8);
+                let pixel_delta = a1 - a2;
+                let a: i16x8 = a1 + 1024;
+                let b : i16x8 = i16x8::new(*left_context.get_vertical()) - (pixel_delta - (pixel_delta>>15) >> 1) /* divide pixel_delta by 2 rounding towards 0 */;
 
-            b - a
+                b - a
+            } else {
+                let mut dc_estimates = [0i16; 8];
+                for i in 0..8 {
+                    let a = i32::from(pixels_sans_dc.get_block()[i << 3]) + 1024;
+                    let pixel_delta = i32::from(pixels_sans_dc.get_block()[i << 3])
+                        - i32::from(pixels_sans_dc.get_block()[(i << 3) + 1]);
+                    let b = i32::from(left_context.get_vertical()[i]) - (pixel_delta / 2); //round to zero
+                    dc_estimates[i] = (b - a) as i16;
+                }
+
+                i16x8::from(dc_estimates)
+            }
         };
 
         let calc_above = || {
             let above_context = block_context.neighbor_context_above(num_non_zeros);
 
-            let a1 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 0, 1);
-            let a2 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 8, 1);
-            let pixel_delta = a1 - a2;
-            let a: i16x8 = a1 + 1024;
-            let b : i16x8 = i16x8::new(*above_context.get_horizontal()) - (pixel_delta - (pixel_delta>>15) >> 1) /* divide pixel_delta by 2 rounding towards 0 */;
+            if enabled_features.use_16bit_adv_predict {
+                let a1 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 0, 1);
+                let a2 = ProbabilityTables::from_stride(&pixels_sans_dc.get_block(), 8, 1);
+                let pixel_delta = a1 - a2;
+                let a: i16x8 = a1 + 1024;
+                let b : i16x8 = i16x8::new(*above_context.get_horizontal()) - (pixel_delta - (pixel_delta>>15) >> 1) /* divide pixel_delta by 2 rounding towards 0 */;
 
-            b - a
+                b - a
+            } else {
+                let mut dc_estimates = [0i16; 8];
+                for i in 0..8 {
+                    let a = i32::from(pixels_sans_dc.get_block()[i]) + 1024;
+                    let pixel_delta = i32::from(pixels_sans_dc.get_block()[i])
+                        - i32::from(pixels_sans_dc.get_block()[i + 8]);
+                    let b = i32::from(above_context.get_horizontal()[i]) - (pixel_delta / 2); //round to zero
+                    dc_estimates[i] = (b - a) as i16;
+                }
+
+                i16x8::from(dc_estimates)
+            }
         };
 
         let min_dc;
diff --git a/tests/end_to_end.rs b/tests/end_to_end.rs
index 884ea4df..6c95ee09 100644
--- a/tests/end_to_end.rs
+++ b/tests/end_to_end.rs
@@ -83,13 +83,33 @@ fn verify_decode(
         &mut Cursor::new(input),
         &mut output,
         8,
-        &EnabledFeatures::all(),
+        &EnabledFeatures::compat_lepton_vector_read(),
     )
     .unwrap();
 
     assert!(output[..] == expected[..]);
 }
 
+/// Verifies that the decode will accept existing Lepton files and generate
+/// exactly the same jpeg from them. Used to detect unexpected divergences in the coding format.
+#[test]
+fn verify_decode_scalar_overflow() {
+    let file = "mathoverflow_scalar";
+
+    println!("decoding {0:?}", file);
+
+    let input = read_file(file, ".lep");
+    let expected = read_file(file, ".jpg");
+
+    let mut output = Vec::new();
+
+    let features = EnabledFeatures::compat_lepton_scalar_read();
+
+    decode_lepton(&mut Cursor::new(input), &mut output, 8, &features).unwrap();
+
+    assert!(output[..] == expected[..]);
+}
+
 /// Verifies that the decode will accept existing Lepton files and generate
 /// exactly the same jpeg from them when called by an external interface
 /// with use_16bit_dc_estimate=true for C++ backward compatibility.
@@ -208,7 +228,7 @@ fn verify_encode(
         &mut Cursor::new(&input),
         &mut Cursor::new(&mut lepton),
         8,
-        &EnabledFeatures::all(),
+        &EnabledFeatures::compat_lepton_vector_write(),
     )
     .unwrap();
 
@@ -216,7 +236,7 @@ fn verify_encode(
         &mut Cursor::new(lepton),
         &mut output,
         8,
-        &EnabledFeatures::all(),
+        &EnabledFeatures::compat_lepton_vector_read(),
     )
     .unwrap();
 
@@ -232,8 +252,7 @@ fn verify_16bitmath() {
 
     let mut output = Vec::new();
 
-    let mut features = EnabledFeatures::all();
-    features.use_16bit_dc_estimate = true;
+    let features = EnabledFeatures::compat_lepton_vector_read();
 
     decode_lepton(&mut Cursor::new(input), &mut output, 8, &features).unwrap();
 
@@ -247,7 +266,7 @@ fn verify_16bitmath() {
 
     let mut output = Vec::new();
 
-    let mut features = EnabledFeatures::all();
+    let mut features = EnabledFeatures::compat_lepton_vector_read();
     features.use_16bit_dc_estimate = false;
 
     decode_lepton(&mut Cursor::new(input), &mut output, 8, &features).unwrap();
@@ -290,7 +309,12 @@ fn verify_extern_16bit_math_retry() {
 fn verify_encode_verify(#[values("slrcity")] file: &str) {
     let input = read_file(file, ".jpg");
 
-    encode_lepton_verify(&input[..], 8, &EnabledFeatures::all()).unwrap();
+    encode_lepton_verify(
+        &input[..],
+        8,
+        &EnabledFeatures::compat_lepton_vector_write(),
+    )
+    .unwrap();
 }
 
 fn assert_exception(expected_error: ExitCode, result: Result) {
@@ -317,7 +341,7 @@ fn verify_encode_progressive_false(
             8,
             &EnabledFeatures {
                 progressive: false,
-                ..Default::default()
+                ..EnabledFeatures::compat_lepton_vector_write()
             },
         ),
     );
@@ -335,7 +359,7 @@ fn verify_nonoptimal() {
         &mut Cursor::new(&input),
         &mut Cursor::new(&mut lepton),
         8,
-        &EnabledFeatures::all(),
+        &EnabledFeatures::compat_lepton_vector_write(),
     ),
     );
 }
@@ -345,13 +369,14 @@ 
 fn verify_encode_image_with_zeros_in_dqt_tables() {
     let input = read_file("zeros_in_dqt_tables", ".jpg");
     let mut lepton = Vec::new();
+
     assert_exception(
         ExitCode::UnsupportedJpeg,
         encode_lepton(
             &mut Cursor::new(&input),
             &mut Cursor::new(&mut lepton),
            8,
-            &EnabledFeatures::all(),
+            &EnabledFeatures::compat_lepton_vector_write(),
        ),
    );
 }
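Note on the flag byte introduced by this patch: write_lepton_header now stores the two back-compat switches in the first of the previously zeroed padding bytes of the "MS" extended header, and read_lepton_header only honors them when bit 7 (0x80) is set, so older files that wrote zeros keep the caller's presets. The sketch below is a standalone illustration of that convention, not code from the repository; the CompatFlags type and the encode_flags/decode_flags helpers are invented for the example.

```rust
// Illustrative sketch (hypothetical helpers, not part of the crate):
// round-trips the back-compat flag byte as written by write_lepton_header
// and interpreted by read_lepton_header in the patch above.
// Bit 7 (0x80) marks the byte as meaningful, bit 0 carries
// use_16bit_dc_estimate and bit 1 carries use_16bit_adv_predict.

#[derive(Debug, Clone, Copy, PartialEq)]
struct CompatFlags {
    use_16bit_dc_estimate: bool,
    use_16bit_adv_predict: bool,
}

fn encode_flags(f: CompatFlags) -> u8 {
    0x80 | (f.use_16bit_dc_estimate as u8) | ((f.use_16bit_adv_predict as u8) << 1)
}

fn decode_flags(byte: u8, defaults: CompatFlags) -> CompatFlags {
    if byte & 0x80 != 0 {
        CompatFlags {
            use_16bit_dc_estimate: byte & 0x01 != 0,
            use_16bit_adv_predict: byte & 0x02 != 0,
        }
    } else {
        // older files wrote zero padding here, so fall back to the caller's presets
        defaults
    }
}

fn main() {
    let scalar = CompatFlags { use_16bit_dc_estimate: false, use_16bit_adv_predict: false };
    let vector = CompatFlags { use_16bit_dc_estimate: true, use_16bit_adv_predict: true };

    // a freshly written file always records its own flags
    assert_eq!(decode_flags(encode_flags(vector), scalar), vector);
    // a legacy file (all-zero padding) leaves the caller's choice in effect
    assert_eq!(decode_flags(0x00, scalar), scalar);
    println!("flag byte for vector-compat files: {:#04x}", encode_flags(vector));
}
```

When the flag byte is absent, the caller's preset still decides the math path, which is why the CLI and tests pick between EnabledFeatures::compat_lepton_scalar_read() and EnabledFeatures::compat_lepton_vector_read() for files produced by the C++ scalar and SSE2/AVX2 builds respectively.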