From ea4b5e4df0fc5f67539347066c9cc502ceada28a Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:36 +0300 Subject: [PATCH 1/9] Remove unnecessary transmute in generate_mask() rand::Rand has impl Rand for [T; 16] where T: Rand so we don't need to simulate it ourselves. --- src/protocol/frame/frame.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/protocol/frame/frame.rs b/src/protocol/frame/frame.rs index 3992bc5..3719e25 100644 --- a/src/protocol/frame/frame.rs +++ b/src/protocol/frame/frame.rs @@ -22,7 +22,7 @@ fn apply_mask(buf: &mut [u8], mask: &[u8; 4]) { #[inline] fn generate_mask() -> [u8; 4] { - unsafe { transmute(rand::random::()) } + rand::random() } /// A struct representing a WebSocket frame. From dd96d3b9d4d6a6999d3b198659c57991351605ac Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:38 +0300 Subject: [PATCH 2/9] Speed up apply_mask() This function is the most speed-critical in the library. In profiles, this optimization reduces it from ~75% of the profile to ~55%. I have tried several approaches, but didn't manage to improve on this one (LLVM already unrolls the loop here). Though I'm sure it is possible. --- src/protocol/frame/frame.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/protocol/frame/frame.rs b/src/protocol/frame/frame.rs index 3719e25..de9ef59 100644 --- a/src/protocol/frame/frame.rs +++ b/src/protocol/frame/frame.rs @@ -14,9 +14,8 @@ use error::{Error, Result}; use super::coding::{OpCode, Control, Data, CloseCode}; fn apply_mask(buf: &mut [u8], mask: &[u8; 4]) { - let iter = buf.iter_mut().zip(mask.iter().cycle()); - for (byte, &key) in iter { - *byte ^= key + for (i, byte) in buf.iter_mut().enumerate() { + *byte ^= mask[i & 3]; } } From 3c600aa13896cb741c63206f92e2650c66ac8def Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:39 +0300 Subject: [PATCH 3/9] Speed up apply_mask() - part 2 This is using some unsafe code and assumptions about alignment to speed up apply_mask() more. In profiles, this optimization reduces it from ~55% of the total runtime to ~7%. --- src/protocol/frame/frame.rs | 48 +++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/protocol/frame/frame.rs b/src/protocol/frame/frame.rs index de9ef59..f3a81e8 100644 --- a/src/protocol/frame/frame.rs +++ b/src/protocol/frame/frame.rs @@ -19,6 +19,26 @@ fn apply_mask(buf: &mut [u8], mask: &[u8; 4]) { } } +/// Faster version of `apply_mask()` which operates on 4-byte blocks. +/// +/// Safety: `buf` must be at least 4-bytes aligned. +unsafe fn apply_mask_aligned32(buf: &mut [u8], mask: &[u8; 4]) { + debug_assert_eq!(buf.as_ptr() as usize % 4, 0); + + let mask_u32 = transmute(*mask); + + let mut ptr = buf.as_mut_ptr() as *mut u32; + for _ in 0..(buf.len() / 4) { + *ptr ^= mask_u32; + ptr = ptr.offset(1); + } + + // Possible last block with less than 4 bytes. + let last_block_start = buf.len() & !3; + let last_block = &mut buf[last_block_start..]; + apply_mask(last_block, mask); +} + #[inline] fn generate_mask() -> [u8; 4] { rand::random() @@ -174,7 +194,10 @@ impl Frame { #[inline] pub fn remove_mask(&mut self) { self.mask.and_then(|mask| { - Some(apply_mask(&mut self.payload, &mask)) + // Assumes Vec's backing memory is at least 4-bytes aligned. + unsafe { + Some(apply_mask_aligned32(&mut self.payload, &mask)) + } }); self.mask = None; } @@ -435,7 +458,10 @@ impl Frame { if self.is_masked() { let mask = self.mask.take().unwrap(); - apply_mask(&mut self.payload, &mask); + // Assumes Vec's backing memory is at least 4-bytes aligned. + unsafe { + apply_mask_aligned32(&mut self.payload, &mask); + } try!(w.write(&mask)); } @@ -489,6 +515,24 @@ mod tests { use super::super::coding::{OpCode, Data}; use std::io::Cursor; + #[test] + fn test_apply_mask() { + let mask = [ + 0x6d, 0xb6, 0xb2, 0x80, + ]; + let unmasked = vec![ + 0xf3, 0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x82, 0xff, 0xfe, 0x00, + ]; + + let mut masked = unmasked.clone(); + apply_mask(&mut masked, &mask); + + let mut masked_aligned = unmasked.clone(); + unsafe { apply_mask_aligned32(&mut masked_aligned, &mask) }; + + assert_eq!(masked, masked_aligned); + } + #[test] fn parse() { let mut raw: Cursor> = Cursor::new(vec![ From 76e80ca9a317b38b820e5bfe67a9f29cab104918 Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:40 +0300 Subject: [PATCH 4/9] Only allocate error message if the error occurred Suggested by clippy: warning: use of `ok_or` followed by a function call --> src/handshake/server.rs:20:19 | 20 | let key = self.headers.find_first("Sec-WebSocket-Key") | ___________________^ starting here... 21 | | .ok_or(Error::Protocol("Missing Sec-WebSocket-Key".into()))?; | |_______________________________________________________________________^ ...ending here | = note: #[warn(or_fun_call)] on by default help: try this | let key = self.headers.find_first("Sec-WebSocket-Key").ok_or_else(|| Error::Protocol("Missing Sec-WebSocket-Key".into()))?; = help: for further information visit https://github.com/Manishearth/rust-clippy/wiki#or_fun_call --- src/client.rs | 2 +- src/handshake/server.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/client.rs b/src/client.rs index d7b3f1f..63e4d8f 100644 --- a/src/client.rs +++ b/src/client.rs @@ -73,7 +73,7 @@ fn wrap_stream(stream: TcpStream, _domain: &str, mode: Mode) -> Result(addrs: A, url: &Url, mode: Mode) -> Result where A: Iterator { - let domain = url.host_str().ok_or(Error::Url("No host name in the URL".into()))?; + let domain = url.host_str().ok_or_else(|| Error::Url("No host name in the URL".into()))?; for addr in addrs { debug!("Trying to contact {} at {}...", url, addr); if let Ok(raw_stream) = TcpStream::connect(addr) { diff --git a/src/handshake/server.rs b/src/handshake/server.rs index d389fd3..fbd08b6 100644 --- a/src/handshake/server.rs +++ b/src/handshake/server.rs @@ -18,7 +18,7 @@ impl Request { /// Reply to the response. pub fn reply(&self) -> Result> { let key = self.headers.find_first("Sec-WebSocket-Key") - .ok_or(Error::Protocol("Missing Sec-WebSocket-Key".into()))?; + .ok_or_else(|| Error::Protocol("Missing Sec-WebSocket-Key".into()))?; let reply = format!("\ HTTP/1.1 101 Switching Protocols\r\n\ Connection: Upgrade\r\n\ From 8955b55e62f215ae0b6da10b4d02ae7dbcc6f9c9 Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:40 +0300 Subject: [PATCH 5/9] Simplify concatenation code using slice::concat --- src/protocol/frame/frame.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/protocol/frame/frame.rs b/src/protocol/frame/frame.rs index f3a81e8..6fbcf12 100644 --- a/src/protocol/frame/frame.rs +++ b/src/protocol/frame/frame.rs @@ -2,7 +2,6 @@ use std::fmt; use std::mem::transmute; use std::io::{Cursor, Read, Write}; use std::default::Default; -use std::iter::FromIterator; use std::string::{String, FromUtf8Error}; use std::result::Result as StdResult; use byteorder::{ByteOrder, NetworkEndian}; @@ -274,10 +273,7 @@ impl Frame { let u: u16 = code.into(); transmute(u.to_be()) }; - Vec::from_iter( - raw[..].iter() - .chain(reason.as_bytes().iter()) - .map(|&b| b)) + [&raw[..], reason.as_bytes()].concat() } else { Vec::new() }; From ae30b8cd76866e99de05fac16001c4a8e43de0a6 Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:41 +0300 Subject: [PATCH 6/9] Apply a couple trivial clippy suggestions --- src/client.rs | 2 +- src/protocol/mod.rs | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/client.rs b/src/client.rs index 63e4d8f..8d98600 100644 --- a/src/client.rs +++ b/src/client.rs @@ -57,7 +57,7 @@ fn wrap_stream(stream: TcpStream, domain: &str, mode: Mode) -> Result f.into(), TlsHandshakeError::Interrupted(_) => panic!("Bug: TLS handshake not blocked"), }) - .map(|s| StreamSwitcher::Tls(s)) + .map(StreamSwitcher::Tls) } } } diff --git a/src/protocol/mod.rs b/src/protocol/mod.rs index 970d99e..b03d954 100644 --- a/src/protocol/mod.rs +++ b/src/protocol/mod.rs @@ -124,15 +124,12 @@ impl WebSocket { /// This function guarantees that the close frame will be queued. /// There is no need to call it again, just like write_message(). pub fn close(&mut self) -> Result<()> { - match self.state { - WebSocketState::Active => { - self.state = WebSocketState::ClosedByUs; - let frame = Frame::close(None); - self.send_queue.push_back(frame); - } - _ => { - // already closed, nothing to do - } + if let WebSocketState::Active = self.state { + self.state = WebSocketState::ClosedByUs; + let frame = Frame::close(None); + self.send_queue.push_back(frame); + } else { + // Already closed, nothing to do. } self.write_pending() } From b5b9e77b03b2686afc513254b52194589cfe3e88 Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 20:41:11 +0300 Subject: [PATCH 7/9] Make env_logger a dev dependency It is only used by the examples; libraries should only depend on the `log` crate. --- Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 571a616..0238bd6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,6 @@ base64 = "0.4.0" byteorder = "1.0.0" bytes = "0.4.1" httparse = "1.2.1" -env_logger = "0.4.2" log = "0.3.7" rand = "0.3.15" sha1 = "0.2.0" @@ -30,3 +29,6 @@ utf-8 = "0.7.0" [dependencies.native-tls] optional = true version = "0.1.1" + +[dev-dependencies] +env_logger = "0.4.2" From be834ac26192f1342b386ec5b66f3044949b827f Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Sat, 25 Mar 2017 18:01:37 +0300 Subject: [PATCH 8/9] Replace unsafe endianness code with byteorder functions --- src/protocol/frame/frame.rs | 57 ++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/src/protocol/frame/frame.rs b/src/protocol/frame/frame.rs index 6fbcf12..d41cde8 100644 --- a/src/protocol/frame/frame.rs +++ b/src/protocol/frame/frame.rs @@ -1,10 +1,10 @@ use std::fmt; use std::mem::transmute; -use std::io::{Cursor, Read, Write}; +use std::io::{Cursor, Read, Write, ErrorKind}; use std::default::Default; use std::string::{String, FromUtf8Error}; use std::result::Result as StdResult; -use byteorder::{ByteOrder, NetworkEndian}; +use byteorder::{ByteOrder, ReadBytesExt, NetworkEndian}; use bytes::BufMut; use rand; @@ -319,29 +319,24 @@ impl Frame { let mut length = (second & 0x7F) as u64; - if length == 126 { - let mut length_bytes = [0u8; 2]; - if try!(cursor.read(&mut length_bytes)) != 2 { - cursor.set_position(initial); - return Ok(None) - } - - length = unsafe { - let mut wide: u16 = transmute(length_bytes); - wide = u16::from_be(wide); - wide - } as u64; - header_length += 2; - } else if length == 127 { - let mut length_bytes = [0u8; 8]; - if try!(cursor.read(&mut length_bytes)) != 8 { - cursor.set_position(initial); - return Ok(None) - } - - unsafe { length = transmute(length_bytes); } - length = u64::from_be(length); - header_length += 8; + if let Some(length_nbytes) = match length { + 126 => Some(2), + 127 => Some(8), + _ => None, + } { + match cursor.read_uint::(length_nbytes) { + Err(ref err) if err.kind() == ErrorKind::UnexpectedEof => { + cursor.set_position(initial); + return Ok(None); + } + Err(err) => { + return Err(Error::from(err)); + } + Ok(read) => { + length = read; + } + }; + header_length += length_nbytes as u64; } trace!("Payload length: {}", length); @@ -425,18 +420,14 @@ impl Frame { try!(w.write(&headers)); } else if self.payload.len() <= 65535 { two |= 126; - let length_bytes: [u8; 2] = unsafe { - let short = self.payload.len() as u16; - transmute(short.to_be()) - }; + let mut length_bytes = [0u8; 2]; + NetworkEndian::write_u16(&mut length_bytes, self.payload.len() as u16); let headers = [one, two, length_bytes[0], length_bytes[1]]; try!(w.write(&headers)); } else { two |= 127; - let length_bytes: [u8; 8] = unsafe { - let long = self.payload.len() as u64; - transmute(long.to_be()) - }; + let mut length_bytes = [0u8; 8]; + NetworkEndian::write_u64(&mut length_bytes, self.payload.len() as u64); let headers = [ one, two, From 8e49bea00fea274cd48a17cb848000186e5ee7e9 Mon Sep 17 00:00:00 2001 From: Ran Benita Date: Mon, 27 Mar 2017 20:33:09 +0300 Subject: [PATCH 9/9] Fix InputBuffer::reserve() to not be a no-op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current implementation uses the `remaining_mut()` function from the bytes::BufMut implementation for Vec. In terms of the BufMut trait, a Vec buffer has infinite capacity - you can always write more to the buffer, since the Vec grows as needed. Hence, the `remaining_mut` here actually returns +∞ (actually, `usize::MAX - len`). So the first `if` is always true, and the calls to `reserve` never actually allocate the appropriate space. What happens instead is that the `bytes_mut()` call in `read_from` picks up the slack, but it merely grows the buffer a 64 bytes at a time, which slows things down. This changes the check to use the Vec capacity instead of `remaining_mut`. In my profile (sending 100,000 10KiB messages back and forth), this reduces `__memmove_avx_unaligned_erms`'s share of the total runtime from ~77% to ~53%. --- src/input_buffer.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/input_buffer.rs b/src/input_buffer.rs index e08e704..c2a87c0 100644 --- a/src/input_buffer.rs +++ b/src/input_buffer.rs @@ -23,7 +23,8 @@ impl InputBuffer { /// Reserve the given amount of space. pub fn reserve(&mut self, space: usize, limit: usize) -> Result<(), SizeLimit>{ - if self.inp_mut().remaining_mut() >= space { + let remaining = self.inp_mut().capacity() - self.inp_mut().len(); + if remaining >= space { // We have enough space right now. Ok(()) } else {