diff --git a/src/libcore/iter/mod.rs b/src/libcore/iter/mod.rs
index ea98265ef8de0..d9b8c5ea589fd 100644
--- a/src/libcore/iter/mod.rs
+++ b/src/libcore/iter/mod.rs
@@ -1100,6 +1100,17 @@ impl<I: Iterator, P> Iterator for Filter<I, P> where P: FnMut(&I::Item) -> bool
         (0, upper) // can't know a lower bound, due to the predicate
     }
 
+    // This special case allows the compiler to make `.filter(_).count()`
+    // branchless. Barring perfect branch prediction (which is unattainable in
+    // the general case), this will be much faster in >90% of cases (covering
+    // virtually all real workloads) and only a tiny bit slower in the rest.
+    //
+    // Having this specialization thus allows us to write `.filter(p).count()`
+    // where we would otherwise write `.map(|x| p(x) as usize).sum()`, which is
+    // less readable and also less backwards-compatible with Rust before 1.10.
+    //
+    // Using the branchless version will also simplify the generated LLVM IR,
+    // thus leaving more budget for LLVM optimizations.
     #[inline]
     fn count(mut self) -> usize {
         let mut count = 0;
diff --git a/src/libcoretest/iter.rs b/src/libcoretest/iter.rs
index 05a674e05d5be..e6d2494f5fda8 100644
--- a/src/libcoretest/iter.rs
+++ b/src/libcoretest/iter.rs
@@ -194,7 +194,7 @@ fn test_iterator_enumerate_count() {
 #[test]
 fn test_iterator_filter_count() {
     let xs = [0, 1, 2, 3, 4, 5, 6, 7, 8];
-    assert_eq!(xs.iter().filter(|x| x % 2 == 0).count(), 5);
+    assert_eq!(xs.iter().filter(|&&x| x % 2 == 0).count(), 5);
 }
 
 #[test]