#![feature(test)] extern crate aho_corasick; extern crate test; use std::iter; use aho_corasick::{Automaton, AcAutomaton, Transitions}; use test::Bencher; const HAYSTACK_RANDOM: &'static str = include_str!("random.txt"); const HAYSTACK_SHERLOCK: &'static str = include_str!("sherlock.txt"); fn bench_aut_no_match, T: Transitions>( b: &mut Bencher, aut: AcAutomaton, haystack: &str, ) { b.bytes = haystack.len() as u64; b.iter(|| assert!(aut.find(haystack).next().is_none())); } fn bench_box_aut_no_match, T: Transitions>( b: &mut Bencher, aut: AcAutomaton, haystack: &str, ) { b.bytes = haystack.len() as u64; let aut: &Automaton

= &aut; b.iter(|| assert!(Automaton::find(&aut, haystack).next().is_none())); } fn bench_full_aut_no_match, T: Transitions>( b: &mut Bencher, aut: AcAutomaton, haystack: &str, ) { let aut = aut.into_full(); b.bytes = haystack.len() as u64; b.iter(|| assert!(aut.find(haystack).next().is_none())); } fn bench_full_aut_overlapping_no_match, T: Transitions>( b: &mut Bencher, aut: AcAutomaton, haystack: &str, ) { let aut = aut.into_full(); b.bytes = haystack.len() as u64; b.iter(|| assert!(aut.find_overlapping(haystack).count() == 0)); } fn bench_naive_no_match(b: &mut Bencher, needles: Vec, haystack: &str) where S: Into { b.bytes = haystack.len() as u64; let needles: Vec = needles.into_iter().map(Into::into).collect(); b.iter(|| assert!(!naive_find(&needles, haystack))); } #[bench] fn bench_construction(b: &mut Bencher) { b.iter(|| { AcAutomaton::new(test::black_box( [ "ADL", "ADl", "AdL", "Adl", "BAK", "BAk", "BAK", "BaK", "Bak", "BaK", "HOL", "HOl", "HoL", "Hol", "IRE", "IRe", "IrE", "Ire", "JOH", "JOh", "JoH", "Joh", "SHE", "SHe", "ShE", "She", "WAT", "WAt", "WaT", "Wat", "aDL", "aDl", "adL", "adl", "bAK", "bAk", "bAK", "baK", "bak", "baK", "hOL", "hOl", "hoL", "hol", "iRE", "iRe", "irE", "ire", "jOH", "jOh", "joH", "joh", "sHE", "sHe", "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", "ſhe", ].iter() .map(|x| *x), )) }) } #[bench] fn bench_full_construction(b: &mut Bencher) { b.iter(|| { AcAutomaton::new(test::black_box( [ "ADL", "ADl", "AdL", "Adl", "BAK", "BAk", "BAK", "BaK", "Bak", "BaK", "HOL", "HOl", "HoL", "Hol", "IRE", "IRe", "IrE", "Ire", "JOH", "JOh", "JoH", "Joh", "SHE", "SHe", "ShE", "She", "WAT", "WAt", "WaT", "Wat", "aDL", "aDl", "adL", "adl", "bAK", "bAk", "bAK", "baK", "bak", "baK", "hOL", "hOl", "hoL", "hol", "iRE", "iRe", "irE", "ire", "jOH", "jOh", "joH", "joh", "sHE", "sHe", "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", "ſhe", ].iter() .map(|x| *x), )).into_full() }) } fn haystack_same(letter: char) -> String { iter::repeat(letter).take(10000).collect() } macro_rules! aut_benches { ($prefix:ident, $aut:expr, $bench:expr) => { mod $prefix { #![allow(unused_imports)] use aho_corasick::{Automaton, AcAutomaton, Sparse}; use test::Bencher; use super::{ HAYSTACK_RANDOM, haystack_same, bench_aut_no_match, bench_box_aut_no_match, bench_full_aut_no_match, bench_full_aut_overlapping_no_match, }; #[bench] fn ac_one_byte(b: &mut Bencher) { let aut = $aut(vec!["a"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_one_prefix_byte_no_match(b: &mut Bencher) { let aut = $aut(vec!["zbc"]); $bench(b, aut, &haystack_same('y')); } #[bench] fn ac_one_prefix_byte_every_match(b: &mut Bencher) { // We lose the benefit of `memchr` because the first byte matches // in every position in the haystack. let aut = $aut(vec!["zbc"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_one_prefix_byte_random(b: &mut Bencher) { let aut = $aut(vec!["zbc\x00"]); $bench(b, aut, HAYSTACK_RANDOM); } #[bench] fn ac_two_bytes(b: &mut Bencher) { let aut = $aut(vec!["a", "b"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_two_diff_prefix(b: &mut Bencher) { let aut = $aut(vec!["abcdef", "bmnopq"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_two_one_prefix_byte_every_match(b: &mut Bencher) { let aut = $aut(vec!["zbcdef", "zmnopq"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_two_one_prefix_byte_no_match(b: &mut Bencher) { let aut = $aut(vec!["zbcdef", "zmnopq"]); $bench(b, aut, &haystack_same('y')); } #[bench] fn ac_two_one_prefix_byte_random(b: &mut Bencher) { let aut = $aut(vec!["zbcdef\x00", "zmnopq\x00"]); $bench(b, aut, HAYSTACK_RANDOM); } #[bench] fn ac_ten_bytes(b: &mut Bencher) { let aut = $aut(vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_ten_diff_prefix(b: &mut Bencher) { let aut = $aut(vec!["abcdef", "bbcdef", "cbcdef", "dbcdef", "ebcdef", "fbcdef", "gbcdef", "hbcdef", "ibcdef", "jbcdef"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_ten_one_prefix_byte_every_match(b: &mut Bencher) { let aut = $aut(vec!["zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", "zgcdef", "zhcdef", "zicdef", "zjcdef"]); $bench(b, aut, &haystack_same('z')); } #[bench] fn ac_ten_one_prefix_byte_no_match(b: &mut Bencher) { let aut = $aut(vec!["zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", "zgcdef", "zhcdef", "zicdef", "zjcdef"]); $bench(b, aut, &haystack_same('y')); } #[bench] fn ac_ten_one_prefix_byte_random(b: &mut Bencher) { let aut = $aut(vec!["zacdef\x00", "zbcdef\x00", "zccdef\x00", "zdcdef\x00", "zecdef\x00", "zfcdef\x00", "zgcdef\x00", "zhcdef\x00", "zicdef\x00", "zjcdef\x00"]); $bench(b, aut, HAYSTACK_RANDOM); } } } } aut_benches!(dense, AcAutomaton::new, bench_aut_no_match); aut_benches!(dense_boxed, AcAutomaton::new, bench_box_aut_no_match); aut_benches!(sparse, AcAutomaton::<&str, Sparse>::with_transitions, bench_aut_no_match); aut_benches!(full, AcAutomaton::new, bench_full_aut_no_match); aut_benches!(full_overlap, AcAutomaton::new, bench_full_aut_overlapping_no_match); // A naive multi-pattern search. // We use this to benchmark *throughput*, so it should never match anything. fn naive_find(needles: &[String], haystack: &str) -> bool { for hi in 0..haystack.len() { let rest = &haystack.as_bytes()[hi..]; for needle in needles { let needle = needle.as_bytes(); if needle.len() > rest.len() { continue; } if needle == &rest[..needle.len()] { // should never happen in throughput benchmarks. return true; } } } false } #[bench] fn naive_one_byte(b: &mut Bencher) { bench_naive_no_match(b, vec!["a"], &haystack_same('z')); } #[bench] fn naive_one_prefix_byte_no_match(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbc"], &haystack_same('y')); } #[bench] fn naive_one_prefix_byte_every_match(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbc"], &haystack_same('z')); } #[bench] fn naive_one_prefix_byte_random(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbc\x00"], HAYSTACK_RANDOM); } #[bench] fn naive_two_bytes(b: &mut Bencher) { bench_naive_no_match(b, vec!["a", "b"], &haystack_same('z')); } #[bench] fn naive_two_diff_prefix(b: &mut Bencher) { bench_naive_no_match(b, vec!["abcdef", "bmnopq"], &haystack_same('z')); } #[bench] fn naive_two_one_prefix_byte_every_match(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbcdef", "zmnopq"], &haystack_same('z')); } #[bench] fn naive_two_one_prefix_byte_no_match(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbcdef", "zmnopq"], &haystack_same('y')); } #[bench] fn naive_two_one_prefix_byte_random(b: &mut Bencher) { bench_naive_no_match(b, vec!["zbcdef\x00", "zmnopq\x00"], HAYSTACK_RANDOM); } #[bench] fn naive_ten_bytes(b: &mut Bencher) { let needles = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]; bench_naive_no_match(b, needles, &haystack_same('z')); } #[bench] fn naive_ten_diff_prefix(b: &mut Bencher) { let needles = vec!["abcdef", "bbcdef", "cbcdef", "dbcdef", "ebcdef", "fbcdef", "gbcdef", "hbcdef", "ibcdef", "jbcdef"]; bench_naive_no_match(b, needles, &haystack_same('z')); } #[bench] fn naive_ten_one_prefix_byte_every_match(b: &mut Bencher) { let needles = vec!["zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", "zgcdef", "zhcdef", "zicdef", "zjcdef"]; bench_naive_no_match(b, needles, &haystack_same('z')); } #[bench] fn naive_ten_one_prefix_byte_no_match(b: &mut Bencher) { let needles = vec!["zacdef", "zbcdef", "zccdef", "zdcdef", "zecdef", "zfcdef", "zgcdef", "zhcdef", "zicdef", "zjcdef"]; bench_naive_no_match(b, needles, &haystack_same('y')); } #[bench] fn naive_ten_one_prefix_byte_random(b: &mut Bencher) { let needles = vec!["zacdef\x00", "zbcdef\x00", "zccdef\x00", "zdcdef\x00", "zecdef\x00", "zfcdef\x00", "zgcdef\x00", "zhcdef\x00", "zicdef\x00", "zjcdef\x00"]; bench_naive_no_match(b, needles, HAYSTACK_RANDOM); } // The organization above is just awful. Let's start over... mod sherlock { use aho_corasick::{Automaton, AcAutomaton}; use test::Bencher; use super::HAYSTACK_SHERLOCK; macro_rules! sherlock { ($name:ident, $count:expr, $pats:expr) => { #[bench] fn $name(b: &mut Bencher) { let haystack = HAYSTACK_SHERLOCK; let aut = AcAutomaton::new($pats).into_full(); b.bytes = haystack.len() as u64; b.iter(|| assert_eq!($count, aut.find(haystack).count())); } } } sherlock!(name_alt1, 158, vec!["Sherlock", "Street"]); sherlock!(name_alt2, 558, vec!["Sherlock", "Holmes"]); sherlock!(name_alt3, 740, vec![ "Sherlock", "Holmes", "Watson", "Irene", "Adler", "John", "Baker", ]); sherlock!(name_alt3_nocase, 1764, vec![ "ADL", "ADl", "AdL", "Adl", "BAK", "BAk", "BAK", "BaK", "Bak", "BaK", "HOL", "HOl", "HoL", "Hol", "IRE", "IRe", "IrE", "Ire", "JOH", "JOh", "JoH", "Joh", "SHE", "SHe", "ShE", "She", "WAT", "WAt", "WaT", "Wat", "aDL", "aDl", "adL", "adl", "bAK", "bAk", "bAK", "baK", "bak", "baK", "hOL", "hOl", "hoL", "hol", "iRE", "iRe", "irE", "ire", "jOH", "jOh", "joH", "joh", "sHE", "sHe", "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", "ſhe", ]); sherlock!(name_alt4, 582, vec!["Sher", "Hol"]); sherlock!(name_alt4_nocase, 1307, vec![ "HOL", "HOl", "HoL", "Hol", "SHE", "SHe", "ShE", "She", "hOL", "hOl", "hoL", "hol", "sHE", "sHe", "shE", "she", "ſHE", "ſHe", "ſhE", "ſhe", ]); sherlock!(name_alt5, 639, vec!["Sherlock", "Holmes", "Watson"]); sherlock!(name_alt5_nocase, 1442, vec![ "HOL", "HOl", "HoL", "Hol", "SHE", "SHe", "ShE", "She", "WAT", "WAt", "WaT", "Wat", "hOL", "hOl", "hoL", "hol", "sHE", "sHe", "shE", "she", "wAT", "wAt", "waT", "wat", "ſHE", "ſHe", "ſhE", "ſhe", ]); }