Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve JPEG writing performance #36

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ default = []
compression_stats = []

[dependencies]
bytemuck = "1"
unroll = "0.1"
bytemuck = "1.13"
byteorder = "1.4"
flate2 = "1.0"
default-boxed = "0.2"
Expand Down
18 changes: 15 additions & 3 deletions src/structs/block_based_image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use crate::consts::ZIGZAG_TO_RASTER;

use super::{block_context::BlockContext, jpeg_header::JPegHeader};

use unroll::unroll_for_loops;

/// holds the 8x8 blocks for a given component. Since we do multithreaded encoding,
/// the image may only hold a subset of the components (specified by dpos_offset),
/// but they can be merged
Expand Down Expand Up @@ -190,8 +192,8 @@ impl Default for AlignedBlock {
}

impl AlignedBlock {
pub fn new(block: [i16; 64]) -> Self {
AlignedBlock { raw_data: block }
pub fn new(data: [i16; 64]) -> Self {
AlignedBlock { raw_data: data }
}

pub fn get_dc(&self) -> i16 {
Expand All @@ -202,7 +204,17 @@ impl AlignedBlock {
self.raw_data[0] = value
}

/// gets underlying array of 64 coefficients (guaranteed to be 32-byte aligned)
// Builds a reordered copy of this block: entry `i` of the result is the
// coefficient stored at index `ZIGZAG_TO_RASTER[i]` of `self`. `self` is not
// modified; the reordered coefficients are returned in a new `AlignedBlock`.
// NOTE(review): presumably `self` holds the coefficients in raster order and
// the result is in JPEG zigzag order — confirm against `ZIGZAG_TO_RASTER`.
#[unroll_for_loops]
pub fn zigzag(&self) -> AlignedBlock {
// start from a default block; every one of the 64 entries is overwritten below
let mut block = AlignedBlock::default();
// NOTE(review): kept as a plain `for` over a literal range, presumably so that
// `unroll_for_loops` can unroll it — confirm against the `unroll` crate docs
for i in 0..64 {
// the table entry is widened to usize so it can be used as an array index
block.raw_data[i] = self.raw_data[usize::from(ZIGZAG_TO_RASTER[i])];
}
return block;
}

/// Borrows the block's 64 raw coefficients as a fixed-size array.
// used for debugging, hence the dead_code allowance
#[allow(dead_code)]
pub fn get_block(&self) -> &[i16; 64] {
    &self.raw_data
}
Expand Down
135 changes: 93 additions & 42 deletions src/structs/jpeg_write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

use anyhow::{Context, Result};
use bytemuck::cast_ref;
use byteorder::WriteBytesExt;

use crate::{
Expand All @@ -45,8 +46,12 @@ use crate::{
use std::{io::Write, num::NonZeroI16};

use super::{
bit_writer::BitWriter, block_based_image::BlockBasedImage, jpeg_header::HuffCodes,
jpeg_position_state::JpegPositionState, lepton_format::LeptonHeader, row_spec::RowSpec,
bit_writer::BitWriter,
block_based_image::{AlignedBlock, BlockBasedImage},
jpeg_header::HuffCodes,
jpeg_position_state::JpegPositionState,
lepton_format::LeptonHeader,
row_spec::RowSpec,
thread_handoff::ThreadHandoff,
};

Expand Down Expand Up @@ -193,14 +198,11 @@ fn recode_one_mcu_row<W: Write>(

if jf.jpeg_type == JPegType::Sequential {
// unzigzag
let mut block = [0i16; 64]; // store block for coeffs
for bpos in 0..64 {
block[bpos] = current_block.get_coefficient_zigzag(bpos);
}
let mut block = current_block.zigzag();

// diff coding for dc
let dc = block[0];
block[0] -= lastdc[state.get_cmp()];
let dc = block.get_block()[0];
block.get_block_mut()[0] -= lastdc[state.get_cmp()];
lastdc[state.get_cmp()] = dc;

// encode block
Expand Down Expand Up @@ -357,59 +359,108 @@ fn encode_block_seq(
huffw: &mut BitWriter,
dctbl: &HuffCodes,
actbl: &HuffCodes,
block: &[i16; 64],
block: &AlignedBlock,
) {
// encode DC
write_coef(huffw, block[0], 0, dctbl);
// process the array of coefficients four at a time, as 16 integers of 4 x 16 = 64 bits each
let block64: &[u64; 16] = cast_ref(block.get_block());

let mut bpos = 1;
// encode AC
while bpos < 64 {
// if nonzero is encountered
let mut tmp = block[bpos];
bpos += 1;
// little endian format since we want to read the 16-bit coefficients from the lowest to the highest in 64 bit chunks
let mut current_value = u64::from_le(block64[0]);

if tmp == 0 {
let mut z = 1;
// write the DC coefficient (which is the first one in the zigzag order)
write_coef(huffw, current_value as i16, 0, dctbl);

loop {
if bpos == 64 {
huffw.write(actbl.c_val[0x00].into(), actbl.c_len[0x00].into());
return;
}
// process the AC coefficients, keeping track of the number of bits left in the current 64 bit block
// we used up the first 16 bits for the DC coefficient, so shift right by 16 and keep track of the number of bits left
current_value >>= 16;
let mut bits_remaining: u32 = 3 * 16;
let mut block_index = 0;
let mut z: u32 = 0; // number of zeros in a row * 16 (shifted because that is the way the coefficients are encoded later on)

'main: loop {
let coef = current_value as i16;
if coef != 0 {
// the lowest coefficient in the current value is non-zero,
// so write the non-zero coefficient we found
write_coef(huffw, coef, z, actbl);

z = 0;
bits_remaining -= 16;
current_value >>= 16;
} else if current_value != 0 {
// otherwise if there was a non-zero in there somewhere, skip to the next one
let nonzero_position = current_value.trailing_zeros() & 0xf0;

z += nonzero_position;
bits_remaining -= nonzero_position;
current_value >>= nonzero_position;
} else {
// otherwise everything was zero, so we increment the number of zeros seen in z
// and move to the next block
z += bits_remaining;

tmp = block[bpos];
bpos += 1;
if block_index >= 15 {
break;
}

if tmp != 0 {
// if we have 16 or more zero, we need to write them in blocks of 16
while z >= 16 {
huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());
z -= 16;
// get the next block of 4 coefficients
block_index += 1;
bits_remaining = 4 * 16;
current_value = u64::from_le(block64[block_index]);

// if z is potentially going to go above 16 zeros in a row, move to a separate loop
// to handle this case, since it requires the extra logic to write extra 0xF0 codes.
// This logic is pretty rare to hit unless the block is almost entirely zero, so it's worth it to have a separate loop to handle this case
if z >= 12 * 16 {
while current_value == 0 {
// everything remaining was zero, so we increment the number of zeros seen in z
z += bits_remaining;

if block_index >= 15 {
break 'main;
}
write_coef(huffw, tmp, z, actbl);
break;

// get the next block of 4 coefficients
block_index += 1;
bits_remaining = 4 * 16;
current_value = u64::from_le(block64[block_index]);
}

// count the trailing zero bits in the current 64 bit block (rounded down to a multiple of 16) to figure out which coefficient wasn't zero
let nonzero_position = current_value.trailing_zeros() & 0xf0;

z += nonzero_position;
bits_remaining -= nonzero_position;
current_value >>= nonzero_position;

// if we have 16 or more zeros, we need to write them in blocks of 16
while z >= 256 {
huffw.write(actbl.c_val[0xF0].into(), actbl.c_len[0xF0].into());
z -= 256;
}

z += 1;
continue 'main;
}
} else {
write_coef(huffw, tmp, 0, actbl);
}
}

// if there were trailing zeros, then write end-of-block code, otherwise unnecessary since we wrote 64 coefficients
if z != 0 {
huffw.write(actbl.c_val[0x00].into(), actbl.c_len[0x00].into());
}
}

/// encodes a coefficient which is a huffman code specifying the size followed
/// by the coefficient itself
#[inline(always)]
fn write_coef(huffw: &mut BitWriter, coef: i16, z: u8, tbl: &HuffCodes) {
fn write_coef(huffw: &mut BitWriter, coef: i16, z: u32, tbl: &HuffCodes) {
// vli encode
let (n, s) = envli(coef);
let hc = ((z & 0xf) << 4) + s;
let hc = (z as usize | s as usize) & 0xff;

// write to huffman writer (combine into single write)
let val = (u32::from(tbl.c_val[usize::from(hc)]) << s) | u32::from(n);
let new_bits = u32::from(tbl.c_len[usize::from(hc)]) + u32::from(s);
let val = (u32::from(tbl.c_val[hc]) << s) | u32::from(n);
let new_bits = u32::from(tbl.c_len[hc]) + u32::from(s);
huffw.write(val, new_bits);
}

Expand Down Expand Up @@ -437,7 +488,7 @@ fn encode_ac_prg_fs(
}

// vli encode
write_coef(huffw, tmp, z, actbl);
write_coef(huffw, tmp, z << 4, actbl);

// reset zeroes
z = 0;
Expand Down Expand Up @@ -517,7 +568,7 @@ fn encode_ac_prg_sa(
// if nonzero is encountered
else if (tmp == 1) || (tmp == -1) {
// vli encode
write_coef(huffw, tmp, z, actbl);
write_coef(huffw, tmp, z << 4, actbl);

// write correction bits
encode_crbits(huffw, correction_bits);
Expand Down
Loading