microsoft · mcroomp · May 29, 2023 · May 29, 2023 · Jun 4, 2023 · Jun 6, 2023
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -26,7 +26,8 @@ default = []
 compression_stats = []
 
 [dependencies]
-bytemuck = "1"
+unroll = "0.1"
+bytemuck = "1.13"
 byteorder = "1.4"
 flate2 = "1.0"
 default-boxed = "0.2"

diff --git a/src/structs/block_based_image.rs b/src/structs/block_based_image.rs
@@ -10,6 +10,8 @@ use crate::consts::ZIGZAG_TO_RASTER;
 
 use super::{block_context::BlockContext, jpeg_header::JPegHeader};
 
+use unroll::unroll_for_loops;
+
 /// holds the 8x8 blocks for a given component. Since we do multithreaded encoding,
 /// the image may only hold a subset of the components (specified by dpos_offset),
 /// but they can be merged
@@ -190,8 +192,8 @@ impl Default for AlignedBlock {
 }
 
 impl AlignedBlock {
-    pub fn new(block: [i16; 64]) -> Self {
-        AlignedBlock { raw_data: block }
+    pub fn new(data: [i16; 64]) -> Self { // wraps the 64 coefficients exactly as given (no reordering)
+        AlignedBlock { raw_data: data }
     }
 
     pub fn get_dc(&self) -> i16 {
@@ -202,7 +204,17 @@ impl AlignedBlock {
         self.raw_data[0] = value
     }
 
-    /// gets underlying array of 64 coefficients (guaranteed to be 32-byte aligned)
+    #[unroll_for_loops]
+    pub fn zigzag(&self) -> AlignedBlock { // returns a reordered copy; self is not modified
+        let mut block = AlignedBlock::default();
+        for i in 0..64 { // output slot i takes the coefficient stored at index ZIGZAG_TO_RASTER[i]
+            block.raw_data[i] = self.raw_data[usize::from(ZIGZAG_TO_RASTER[i])];
+        }
+        return block;
+    }
+
+    // borrows the underlying array of 64 coefficients. NOTE(review): jpeg_write.rs also calls this, so "debugging only" / allow(dead_code) may be stale
+    #[allow(dead_code)]
     pub fn get_block(&self) -> &[i16; 64] {
         return &self.raw_data;
     }

diff --git a/src/structs/jpeg_write.rs b/src/structs/jpeg_write.rs
@@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
 use anyhow::{Context, Result};
+use bytemuck::cast_ref;
 use byteorder::WriteBytesExt;
 
 use crate::{
@@ -45,8 +46,12 @@ use crate::{
 use std::{io::Write, num::NonZeroI16};
 
 use super::{
-    bit_writer::BitWriter, block_based_image::BlockBasedImage, jpeg_header::HuffCodes,
-    jpeg_position_state::JpegPositionState, lepton_format::LeptonHeader, row_spec::RowSpec,
+    bit_writer::BitWriter,
+    block_based_image::{AlignedBlock, BlockBasedImage},
+    jpeg_header::HuffCodes,
+    jpeg_position_state::JpegPositionState,
+    lepton_format::LeptonHeader,
+    row_spec::RowSpec,
     thread_handoff::ThreadHandoff,
 };
 
@@ -193,14 +198,11 @@ fn recode_one_mcu_row<W: Write>(
 
             if jf.jpeg_type == JPegType::Sequential {
                 // unzigzag
-                let mut block = [0i16; 64]; // store block for coeffs
-                for bpos in 0..64 {
-                    block[bpos] = current_block.get_coefficient_zigzag(bpos);
-                }
+                let mut block = current_block.zigzag();
 
                 // diff coding for dc
-                let dc = block[0];
-                block[0] -= lastdc[state.get_cmp()];
+                let dc = block.get_block()[0];
+                block.get_block_mut()[0] -= lastdc[state.get_cmp()];
                 lastdc[state.get_cmp()] = dc;
 
                 // encode block
@@ -357,59 +359,108 @@ fn encode_block_seq(
     huffw: &mut BitWriter,
     dctbl: &HuffCodes,
     actbl: &HuffCodes,
-    block: &[i16; 64],
+    block: &AlignedBlock,
 ) {
-    // encode DC
-    write_coef(huffw, block[0], 0, dctbl);
+    // view the 64 16-bit coefficients as 16 u64 words, each packing 4 coefficients (4 x 16 = 64 bits)
+    let block64: &[u64; 16] = cast_ref(block.get_block()); // NOTE(review): cast_ref panics if the array is not u64-aligned; confirm AlignedBlock guarantees it
 
-    let mut bpos = 1;
-    // encode AC
-    while bpos < 64 {
-        // if nonzero is encountered
-        let mut tmp = block[bpos];
-        bpos += 1;
+    // little endian format since we want to read the 16-bit coefficients from the lowest to the highest in 64 bit chunks
+    let mut current_value = u64::from_le(block64[0]);
 
-        if tmp == 0 {
-            let mut z = 1;
+    // write the DC coefficient (which is the first one in the zigzag order)
+    write_coef(huffw, current_value as i16, 0, dctbl);
 
-            loop {
-                if bpos == 64 {
-                    huffw.write(actbl.c_val[0x00].into(), actbl.c_len[0x00].into());
-                    return;
-                }
+    // process the AC coefficients, keeping track of the number of bits left in the current 64 bit block
+    // we used up the first 16 bits for the DC coefficient, so shift right and record that 3 coefficients (48 bits) remain
+    current_value >>= 16;
+    let mut bits_remaining: u32 = 3 * 16;
+    let mut block_index = 0;
+    let mut z: u32 = 0; // run of consecutive zero coefficients * 16 (kept pre-shifted because write_coef ORs it directly into the huffman table index)
+
+    'main: loop {
+        let coef = current_value as i16;
+        if coef != 0 {
+            // the lowest coefficient in the word is non-zero, so write it
+            // together with the run of zeros (z) that preceded it
+            write_coef(huffw, coef, z, actbl);
+
+            z = 0;
+            bits_remaining -= 16;
+            current_value >>= 16;
+        } else if current_value != 0 {
+            // lowest coefficient is zero but a later one in this word is not: skip to it (trailing_zeros rounded down to a multiple of 16 bits)
+            let nonzero_position = current_value.trailing_zeros() & 0xf0;
+
+            z += nonzero_position;
+            bits_remaining -= nonzero_position;
+            current_value >>= nonzero_position;
+        } else {
+            // otherwise everything was zero, so we increment the number of zeros seen in z
+            // and move to the next block
+            z += bits_remaining;
 
-                tmp = block[bpos];
-                bpos += 1;
+            if block_index >= 15 {
+                break;
+            }
 
-                if tmp != 0 {
-                    // if we have 16 or more zero, we need to write them in blocks of 16
-                    while z >= 16 {
-                        huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());
-                        z -= 16;
+            // get the next block of 4 coefficients
+            block_index += 1;
+            bits_remaining = 4 * 16;
+            current_value = u64::from_le(block64[block_index]);
+
+            // if z could reach 16 or more zeros in a row within the next word, move to a separate loop
+            // to handle this case, since it requires the extra logic to write extra 0xF0 codes.
+            // This logic is pretty rare to hit unless the block is almost entirely zero, so it's worth it to have a separate loop to handle this case
+            if z >= 12 * 16 {
+                while current_value == 0 {
+                    // everything remaining was zero, so we increment the number of zeros seen in z
+                    z += bits_remaining;
+
+                    if block_index >= 15 {
+                        break 'main;
                     }
-                    write_coef(huffw, tmp, z, actbl);
-                    break;
+
+                    // get the next block of 4 coefficients
+                    block_index += 1;
+                    bits_remaining = 4 * 16;
+                    current_value = u64::from_le(block64[block_index]);
+                }
+
+                // count the trailing zero bits of the current 64 bit block to locate the first non-zero coefficient
+                let nonzero_position = current_value.trailing_zeros() & 0xf0;
+
+                z += nonzero_position;
+                bits_remaining -= nonzero_position;
+                current_value >>= nonzero_position;
+
+                // if we have 16 or more zeros (z >= 16 * 16), emit one 0xF0 code per full run of 16
+                while z >= 256 {
+                    huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());
+                    z -= 256;
                 }
 
-                z += 1;
+                continue 'main;
             }
-        } else {
-            write_coef(huffw, tmp, 0, actbl);
         }
     }
+
+    // if there were trailing zeros, then write end-of-block code, otherwise unnecessary since we wrote 64 coefficients
+    if z != 0 {
+        huffw.write(actbl.c_val[0x00].into(), actbl.c_len[0x00].into());
+    }
 }
 
 /// encodes a coefficient which is a huffman code specifying the size followed
 /// by the coefficient itself
 #[inline(always)]
-fn write_coef(huffw: &mut BitWriter, coef: i16, z: u8, tbl: &HuffCodes) {
+fn write_coef(huffw: &mut BitWriter, coef: i16, z: u32, tbl: &HuffCodes) { // z is the zero run already multiplied by 16 (callers pass `run << 4`)
     // vli encode
     let (n, s) = envli(coef);
-    let hc = ((z & 0xf) << 4) + s;
+    let hc = (z as usize | s as usize) & 0xff; // table index in 0..=255: zero run in the upper bits, s in the lower (assumes s < 16 — confirm in envli)
 
     // write to huffman writer (combine into single write)
-    let val = (u32::from(tbl.c_val[usize::from(hc)]) << s) | u32::from(n);
-    let new_bits = u32::from(tbl.c_len[usize::from(hc)]) + u32::from(s);
+    let val = (u32::from(tbl.c_val[hc]) << s) | u32::from(n);
+    let new_bits = u32::from(tbl.c_len[hc]) + u32::from(s);
     huffw.write(val, new_bits);
 }
 
@@ -437,7 +488,7 @@ fn encode_ac_prg_fs(
             }
 
             // vli encode
-            write_coef(huffw, tmp, z, actbl);
+            write_coef(huffw, tmp, z << 4, actbl);
 
             // reset zeroes
             z = 0;
@@ -517,7 +568,7 @@ fn encode_ac_prg_sa(
         // if nonzero is encountered
         else if (tmp == 1) || (tmp == -1) {
             // vli encode
-            write_coef(huffw, tmp, z, actbl);
+            write_coef(huffw, tmp, z << 4, actbl);
 
             // write correction bits
             encode_crbits(huffw, correction_bits);