refactoring/apply_mask: reduce the number of unsafe usages to 1

It should also be faster now:

apply_mask/fallback/0   time:   [24.282 ns 24.727 ns 25.255 ns]
apply_mask/fast/0       time:   [11.921 ns 11.963 ns 12.011 ns]
apply_mask/fast_safe/0  time:   [7.9340 ns 7.9807 ns 8.0261 ns]

apply_mask/fallback/1   time:   [25.284 ns 25.710 ns 26.124 ns]
apply_mask/fast/1       time:   [20.433 ns 20.476 ns 20.511 ns]
apply_mask/fast_safe/1  time:   [9.3208 ns 9.3833 ns 9.4470 ns]

apply_mask/fallback/2   time:   [16.051 ns 16.160 ns 16.275 ns]
apply_mask/fast/2       time:   [20.564 ns 20.569 ns 20.574 ns]
apply_mask/fast_safe/2  time:   [9.1449 ns 9.1830 ns 9.2189 ns]

apply_mask/fallback/3   time:   [15.386 ns 15.548 ns 15.715 ns]
apply_mask/fast/3       time:   [18.836 ns 18.867 ns 18.917 ns]
apply_mask/fast_safe/3  time:   [8.3092 ns 8.3566 ns 8.4076 ns]
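
For reference, numbers like these can be produced with a Criterion benchmark group parameterized by the buffer's start offset (presumably what the trailing /0 .. /3 in the names refers to). The sketch below is illustrative, not the crate's own bench file: the mask value, buffer size, and registering only the fallback variant are assumptions, and it expects the usual [[bench]] harness = false entry in Cargo.toml.

// benches/apply_mask.rs -- illustrative sketch only.
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

/// Safe byte-by-byte masking, as in the diff below.
fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
    for (i, byte) in buf.iter_mut().enumerate() {
        *byte ^= mask[i & 3];
    }
}

fn bench_apply_mask(c: &mut Criterion) {
    let mask = [0x6d, 0xb6, 0xb2, 0x80];
    let mut group = c.benchmark_group("apply_mask");
    // Benchmark each starting offset 0..4 to cover every alignment of the buffer.
    for offset in 0..4usize {
        let data = vec![0x55u8; 512 + offset];
        group.bench_with_input(BenchmarkId::new("fallback", offset), &offset, |b, &off| {
            let mut buf = data.clone();
            b.iter(|| apply_mask_fallback(&mut buf[off..], mask));
        });
        // The "fast" (pointer-based) and "fast_safe" (align_to_mut-based) variants
        // would be registered here in the same way.
    }
    group.finish();
}

criterion_group!(benches, bench_apply_mask);
criterion_main!(benches);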
Branch: pull/126/head
Author: Evgeniy A. Dushistov (5 years ago)
Parent: 9764ac13a1
Commit: 5a95f12c6d

1 changed file (59 changed lines): src/protocol/frame/mask.rs

--- a/src/protocol/frame/mask.rs
+++ b/src/protocol/frame/mask.rs
@@ -1,8 +1,3 @@
-use std::cmp::min;
-#[allow(deprecated)]
-use std::mem::uninitialized;
-use std::ptr::{copy_nonoverlapping, read_unaligned};
-
 /// Generate a random frame mask.
 #[inline]
 pub fn generate_mask() -> [u8; 4] {
@@ -17,7 +12,6 @@ pub fn apply_mask(buf: &mut [u8], mask: [u8; 4]) {
 
 /// A safe unoptimized mask application.
 #[inline]
-#[allow(dead_code)]
 fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
     for (i, byte) in buf.iter_mut().enumerate() {
         *byte ^= mask[i & 3];
@@ -26,21 +20,13 @@ fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
 
 /// Faster version of `apply_mask()` which operates on 4-byte blocks.
 #[inline]
-#[allow(dead_code, clippy::cast_ptr_alignment)]
-fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
-    let mask_u32: u32 = unsafe { read_unaligned(mask.as_ptr() as *const u32) };
-    let mut ptr = buf.as_mut_ptr();
-    let mut len = buf.len();
-
-    // Possible first unaligned block.
-    let head = min(len, (4 - (ptr as usize & 3)) & 3);
+pub fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
+    let mask_u32 = u32::from_ne_bytes(mask);
+
+    let (mut prefix, words, mut suffix) = unsafe { buf.align_to_mut::<u32>() };
+    apply_mask_fallback(&mut prefix, mask);
+    let head = prefix.len() & 3;
     let mask_u32 = if head > 0 {
-        unsafe {
-            xor_mem(ptr, mask_u32, head);
-            ptr = ptr.add(head);
-        }
-        len -= head;
         if cfg!(target_endian = "big") {
             mask_u32.rotate_left(8 * head as u32)
         } else {
@@ -49,39 +35,10 @@ fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
     } else {
         mask_u32
     };
-
-    if len > 0 {
-        debug_assert_eq!(ptr as usize % 4, 0);
-    }
-
-    // Properly aligned middle of the data.
-    while len > 4 {
-        unsafe {
-            *(ptr as *mut u32) ^= mask_u32;
-            ptr = ptr.offset(4);
-            len -= 4;
-        }
-    }
-
-    // Possible last block.
-    if len > 0 {
-        unsafe {
-            xor_mem(ptr, mask_u32, len);
-        }
-    }
-}
-
-#[inline]
-// TODO: copy_nonoverlapping here compiles to call memcpy. While it is not so inefficient,
-// it could be done better. The compiler does not see that len is limited to 3.
-unsafe fn xor_mem(ptr: *mut u8, mask: u32, len: usize) {
-    #[allow(deprecated)]
-    let mut b: u32 = uninitialized();
-    #[allow(trivial_casts)]
-    copy_nonoverlapping(ptr, &mut b as *mut _ as *mut u8, len);
-    b ^= mask;
-    #[allow(trivial_casts)]
-    copy_nonoverlapping(&b as *const _ as *const u8, ptr, len);
+    for word in words.iter_mut() {
+        *word ^= mask_u32;
+    }
+    apply_mask_fallback(&mut suffix, mask_u32.to_ne_bytes());
 }
 
 #[cfg(test)]
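
As a sanity check on the refactoring, here is a small standalone sketch (not the crate's own test suite) comparing the align_to_mut-based path against the byte-by-byte fallback for every start alignment and a range of lengths. The two functions are lightly adapted copies of the ones introduced in the diff; the mask value and buffer size are arbitrary.

/// Safe byte-by-byte masking (as in the diff above).
fn apply_mask_fallback(buf: &mut [u8], mask: [u8; 4]) {
    for (i, byte) in buf.iter_mut().enumerate() {
        *byte ^= mask[i & 3];
    }
}

/// align_to_mut-based masking (adapted from the diff above). The single remaining
/// unsafe block splits the buffer into an unaligned prefix, aligned u32 words, and a
/// trailing suffix; rotating the mask word by 8 * head bits keeps the mask bytes
/// lined up with their positions in the original buffer.
fn apply_mask_fast32(buf: &mut [u8], mask: [u8; 4]) {
    let mask_u32 = u32::from_ne_bytes(mask);
    let (prefix, words, suffix) = unsafe { buf.align_to_mut::<u32>() };
    apply_mask_fallback(prefix, mask);
    let head = prefix.len() & 3;
    let mask_u32 = if head > 0 {
        if cfg!(target_endian = "big") {
            mask_u32.rotate_left(8 * head as u32)
        } else {
            mask_u32.rotate_right(8 * head as u32)
        }
    } else {
        mask_u32
    };
    for word in words.iter_mut() {
        *word ^= mask_u32;
    }
    apply_mask_fallback(suffix, mask_u32.to_ne_bytes());
}

fn main() {
    let mask = [0x6d, 0xb6, 0xb2, 0x80];
    let original: Vec<u8> = (0u8..40).collect();

    // Mask the same subslice with both implementations and compare; varying the
    // start offset exercises the unaligned-prefix and rotated-mask paths.
    for off in 0..4usize {
        for len in 0..=32usize {
            let mut a = original.clone();
            let mut b = original.clone();
            apply_mask_fallback(&mut a[off..off + len], mask);
            apply_mask_fast32(&mut b[off..off + len], mask);
            assert_eq!(a, b, "mismatch at off={}, len={}", off, len);
        }
    }
    println!("fast32 agrees with fallback for all tested offsets and lengths");
}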
