From 9764ac13a1f3e0bf548135b5f7995beb02b24c92 Mon Sep 17 00:00:00 2001
From: "Evgeniy A. Dushistov"
Date: Sat, 23 May 2020 18:53:22 +0300
Subject: [PATCH 1/2] infra/apply_mask: test more cases for apply_mask

Make sure that inputs with len 0, 1, 2, 3 are also tested.
---
 src/protocol/frame/mask.rs | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/src/protocol/frame/mask.rs b/src/protocol/frame/mask.rs
index b357795..0ebcf55 100644
--- a/src/protocol/frame/mask.rs
+++ b/src/protocol/frame/mask.rs
@@ -1,4 +1,3 @@
-use rand;
 use std::cmp::min;
 #[allow(deprecated)]
 use std::mem::uninitialized;
@@ -87,8 +86,7 @@ unsafe fn xor_mem(ptr: *mut u8, mask: u32, len: usize) {
 
 #[cfg(test)]
 mod tests {
-
-    use super::{apply_mask_fallback, apply_mask_fast32};
+    use super::*;
 
     #[test]
     fn test_apply_mask() {
@@ -98,26 +96,21 @@ mod tests {
             0x12, 0x03,
         ];
 
-        // Check masking with proper alignment.
-        {
-            let mut masked = unmasked.clone();
-            apply_mask_fallback(&mut masked, mask);
-
-            let mut masked_fast = unmasked.clone();
-            apply_mask_fast32(&mut masked_fast, mask);
-
-            assert_eq!(masked, masked_fast);
-        }
-
-        // Check masking without alignment.
-        {
-            let mut masked = unmasked.clone();
-            apply_mask_fallback(&mut masked[1..], mask);
-
-            let mut masked_fast = unmasked.clone();
-            apply_mask_fast32(&mut masked_fast[1..], mask);
-
-            assert_eq!(masked, masked_fast);
+        for data_len in 0..=unmasked.len() {
+            let unmasked = &unmasked[0..data_len];
+            // Check masking with different alignment.
+            for off in 0..=3 {
+                if unmasked.len() < off {
+                    continue;
+                }
+                let mut masked = unmasked.to_vec();
+                apply_mask_fallback(&mut masked[off..], mask);
+
+                let mut masked_fast = unmasked.to_vec();
+                apply_mask_fast32(&mut masked_fast[off..], mask);
+
+                assert_eq!(masked, masked_fast);
+            }
         }
     }
 }

From 5a95f12c6d07b9bad4369ee1886f54b3eb527810 Mon Sep 17 00:00:00 2001
From: "Evgeniy A. Dushistov"
Date: Sat, 23 May 2020 18:57:09 +0300
Subject: [PATCH 2/2] refactoring/apply_mask: reduce the number of unsafe
 usages to 1

Plus it should be faster now:

apply_mask/fallback/0   time: [24.282 ns 24.727 ns 25.255 ns]
apply_mask/fast/0       time: [11.921 ns 11.963 ns 12.011 ns]
apply_mask/fast_safe/0  time: [7.9340 ns 7.9807 ns 8.0261 ns]

apply_mask/fallback/1   time: [25.284 ns 25.710 ns 26.124 ns]
apply_mask/fast/1       time: [20.433 ns 20.476 ns 20.511 ns]
apply_mask/fast_safe/1  time: [9.3208 ns 9.3833 ns 9.4470 ns]

apply_mask/fallback/2   time: [16.051 ns 16.160 ns 16.275 ns]
apply_mask/fast/2       time: [20.564 ns 20.569 ns 20.574 ns]
apply_mask/fast_safe/2  time: [9.1449 ns 9.1830 ns 9.2189 ns]

apply_mask/fallback/3   time: [15.386 ns 15.548 ns 15.715 ns]
apply_mask/fast/3       time: [18.836 ns 18.867 ns 18.917 ns]
apply_mask/fast_safe/3  time: [8.3092 ns 8.3566 ns 8.4076 ns]
---
 src/protocol/frame/mask.rs | 59 ++++++--------------------------------
 1 file changed, 8 insertions(+), 51 deletions(-)

diff --git a/src/protocol/frame/mask.rs b/src/protocol/frame/mask.rs
index 0ebcf55..28f0eaf 100644
--- a/src/protocol/frame/mask.rs
+++ b/src/protocol/frame/mask.rs
@@ -1,8 +1,3 @@
-use std::cmp::min;
-#[allow(deprecated)]
-use std::mem::uninitialized;
-use std::ptr::{copy_nonoverlapping, read_unaligned};
-
 /// Generate a random frame mask.
 #[inline]
 pub fn generate_mask() -> [u8; 4] {
@@ -17,7 +12,6 @@ pub fn apply_mask(buf: &mut [u8], mask: [u8; 4]) {
 
 /// A safe unoptimized mask application.
 #[inline]
-#[allow(dead_code)]
 fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
     for (i, byte) in buf.iter_mut().enumerate() {
         *byte ^= mask[i & 3];
@@ -26,21 +20,13 @@ fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
 
 /// Faster version of `apply_mask()` which operates on 4-byte blocks.
 #[inline]
-#[allow(dead_code, clippy::cast_ptr_alignment)]
-fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
-    let mask_u32: u32 = unsafe { read_unaligned(mask.as_ptr() as *const u32) };
-
-    let mut ptr = buf.as_mut_ptr();
-    let mut len = buf.len();
+pub fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
+    let mask_u32 = u32::from_ne_bytes(mask);
 
-    // Possible first unaligned block.
-    let head = min(len, (4 - (ptr as usize & 3)) & 3);
+    let (mut prefix, words, mut suffix) = unsafe { buf.align_to_mut::<u32>() };
+    apply_mask_fallback(&mut prefix, mask);
+    let head = prefix.len() & 3;
     let mask_u32 = if head > 0 {
-        unsafe {
-            xor_mem(ptr, mask_u32, head);
-            ptr = ptr.add(head);
-        }
-        len -= head;
         if cfg!(target_endian = "big") {
             mask_u32.rotate_left(8 * head as u32)
         } else {
@@ -49,39 +35,10 @@ fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
     } else {
         mask_u32
     };
-
-    if len > 0 {
-        debug_assert_eq!(ptr as usize % 4, 0);
-    }
-
-    // Properly aligned middle of the data.
-    while len > 4 {
-        unsafe {
-            *(ptr as *mut u32) ^= mask_u32;
-            ptr = ptr.offset(4);
-            len -= 4;
-        }
+    for word in words.iter_mut() {
+        *word ^= mask_u32;
     }
-
-    // Possible last block.
-    if len > 0 {
-        unsafe {
-            xor_mem(ptr, mask_u32, len);
-        }
-    }
-}
-
-#[inline]
-// TODO: copy_nonoverlapping here compiles to call memcpy. While it is not so inefficient,
-// it could be done better. The compiler does not see that len is limited to 3.
-unsafe fn xor_mem(ptr: *mut u8, mask: u32, len: usize) {
-    #[allow(deprecated)]
-    let mut b: u32 = uninitialized();
-    #[allow(trivial_casts)]
-    copy_nonoverlapping(ptr, &mut b as *mut _ as *mut u8, len);
-    b ^= mask;
-    #[allow(trivial_casts)]
-    copy_nonoverlapping(&b as *const _ as *const u8, ptr, len);
+    apply_mask_fallback(&mut suffix, mask_u32.to_ne_bytes());
 }
 
 #[cfg(test)]
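
Note (not part of the patches): the core idea of the second commit is that `align_to_mut::<u32>` splits the buffer into an unaligned byte prefix, aligned u32 words, and a byte tail; the edges are masked bytewise, and the 4-byte mask word is rotated by the prefix length so the word-wise XOR over the middle stays in phase with the byte stream. The standalone sketch below mirrors that logic with extra comments and checks it against the bytewise reference over the same lengths and offsets the new test exercises. The names here are illustrative only and are not taken from the patch.

// Sketch of the word-wise masking approach; `mask_words_sketch` and
// `mask_bytes` are hypothetical names, not tungstenite API.
fn mask_words_sketch(buf: &mut [u8], mask: [u8; 4]) {
    // Bytewise reference implementation, used for the unaligned edges.
    fn mask_bytes(buf: &mut [u8], mask: [u8; 4]) {
        for (i, byte) in buf.iter_mut().enumerate() {
            *byte ^= mask[i & 3];
        }
    }

    let mask_u32 = u32::from_ne_bytes(mask);

    // Sound to view the bytes as u32 words: every bit pattern is a valid u32.
    let (prefix, words, suffix) = unsafe { buf.align_to_mut::<u32>() };

    // Mask the unaligned head bytewise, then rotate the mask word by the
    // number of bytes consumed so the word-wise XOR stays in phase.
    let head = prefix.len() & 3;
    mask_bytes(prefix, mask);
    let mask_u32 = if head > 0 {
        if cfg!(target_endian = "big") {
            mask_u32.rotate_left(8 * head as u32)
        } else {
            mask_u32.rotate_right(8 * head as u32)
        }
    } else {
        mask_u32
    };

    // Aligned middle: one XOR per 4-byte word.
    for word in words.iter_mut() {
        *word ^= mask_u32;
    }

    // The tail continues from the same (rotated) mask phase.
    mask_bytes(suffix, mask_u32.to_ne_bytes());
}

fn main() {
    let mask = [0x6d, 0xb6, 0xb2, 0x80];
    let data: Vec<u8> = (0u8..32).collect();

    // Mirror the new test: every length and every start offset 0..=3,
    // with the bytewise loop as the oracle.
    for len in 0..=data.len() {
        for off in 0..=len.min(3) {
            let mut fast = data[..len].to_vec();
            mask_words_sketch(&mut fast[off..], mask);

            let mut reference = data[..len].to_vec();
            for (i, byte) in reference[off..].iter_mut().enumerate() {
                *byte ^= mask[i & 3];
            }
            assert_eq!(fast, reference);
        }
    }
    println!("word-wise masking matches the bytewise reference");
}

If the rotation were omitted, any buffer whose start is not 4-byte aligned would be masked with the wrong phase of the key, which is exactly the class of bug the extended test in the first patch is designed to catch.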