use std::cmp; use std::io; use memchr::{memchr, memrchr}; use regex::bytes::{Regex, RegexBuilder}; use syntax; use literals::LiteralSets; use nonl; use {Error, Result}; #[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct Match { start: usize, end: usize, line: Option, locations: Vec<(usize, usize)>, } impl Match { pub fn new() -> Match { Match::default() } /// Return the starting byte offset of the line that matched. #[inline] pub fn start(&self) -> usize { self.start } /// Return the ending byte offset of the line that matched. #[inline] pub fn end(&self) -> usize { self.end } /// Return the line number that this match corresponds to. /// /// Note that this is `None` if line numbers aren't being computed. Line /// number tracking can be enabled using `GrepBuilder`. #[inline] pub fn line(&self) -> Option { self.line } /// Return the exact start and end locations (in byte offsets) of every /// regex match in this line. /// /// Note that this always returns an empty slice if exact locations aren't /// computed. Exact location tracking can be enabled using `GrepBuilder`. #[inline] pub fn locations(&self) -> &[(usize, usize)] { &self.locations } } #[derive(Clone, Debug)] pub struct Grep { re: Regex, required: Option, opts: Options, } #[derive(Clone, Debug)] pub struct GrepBuilder { pattern: String, opts: Options, } #[derive(Clone, Debug)] struct Options { case_insensitive: bool, lines: bool, locations: bool, line_terminator: u8, size_limit: usize, dfa_size_limit: usize, } impl Default for Options { fn default() -> Options { Options { case_insensitive: false, lines: false, locations: false, line_terminator: b'\n', size_limit: 10 * (1 << 20), dfa_size_limit: 10 * (1 << 20), } } } impl GrepBuilder { /// Create a new builder for line searching. /// /// The pattern given should be a regular expression. The precise syntax /// supported is documented on the regex crate. pub fn new(pattern: &str) -> GrepBuilder { GrepBuilder { pattern: pattern.to_string(), opts: Options::default(), } } /// Sets whether line numbers are reported for each match. /// /// When enabled (disabled by default), every matching line is tagged with /// its corresponding line number according to the line terminator that is /// set. Note that this requires extra processing which can slow down /// search. pub fn line_numbers(mut self, yes: bool) -> GrepBuilder { self.opts.lines = yes; self } /// Set whether precise match locations are reported for each matching /// line. /// /// When enabled (disabled by default), every match of the regex on each /// matchling line is reported via byte offsets. Note that this requires /// extra processing which can slow down search. pub fn locations(mut self, yes: bool) -> GrepBuilder { self.opts.locations = yes; self } /// Set the line terminator. /// /// The line terminator can be any ASCII character and serves to delineate /// the match boundaries in the text searched. /// /// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII). pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder { assert!(ascii_byte <= 0x7F); self.opts.line_terminator = ascii_byte; self } /// Set the case sensitive flag (`i`) on the regex. pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder { self.opts.case_insensitive = yes; self } /// Set the approximate size limit of the compiled regular expression. /// /// This roughly corresponds to the number of bytes occupied by a /// single compiled program. If the program exceeds this number, then a /// compilation error is returned. pub fn size_limit(mut self, limit: usize) -> GrepBuilder { self.opts.size_limit = limit; self } /// Set the approximate size of the cache used by the DFA. /// /// This roughly corresponds to the number of bytes that the DFA will use /// while searching. /// /// Note that this is a per thread limit. There is no way to set a global /// limit. In particular, if a regex is used from multiple threads /// simulanteously, then each thread may use up to the number of bytes /// specified here. pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder { self.opts.dfa_size_limit = limit; self } /// Create a line searcher. /// /// If there was a problem parsing or compiling the regex with the given /// options, then an error is returned. pub fn create(self) -> Result { let expr = try!(self.parse()); let literals = LiteralSets::create(&expr); let re = try!( RegexBuilder::new(&expr.to_string()) .case_insensitive(self.opts.case_insensitive) .multi_line(true) .unicode(true) .size_limit(self.opts.size_limit) .dfa_size_limit(self.opts.dfa_size_limit) .compile() ); Ok(Grep { re: re, required: literals.to_regex(), opts: self.opts, }) } /// Parses the underlying pattern and ensures the pattern can never match /// the line terminator. fn parse(&self) -> Result { let expr = try!(syntax::ExprBuilder::new() .allow_bytes(true) .unicode(true) .parse(&self.pattern)); Ok(try!(nonl::remove(expr, self.opts.line_terminator))) } } impl Grep { pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> { Iter { searcher: self, buf: buf, start: 0, } } pub fn buffered_reader<'g, R: io::Read>( &'g self, buf: Buffer, rdr: R, ) -> GrepBuffered<'g, R> { GrepBuffered { grep: self, rdr: rdr, b: buf, pos: 0, start: 0, lastnl: 0, end: 0, } } pub fn read_match( &self, mat: &mut Match, buf: &[u8], mut start: usize, ) -> bool { if start >= buf.len() { return false; } if let Some(ref req) = self.required { while start < buf.len() { let e = match req.shortest_match(&buf[start..]) { None => return false, Some(e) => start + e, }; let (prevnl, nextnl) = self.find_line(buf, e, e); match self.re.shortest_match(&buf[prevnl..nextnl]) { None => { start = nextnl + 1; continue; } Some(_) => { self.fill_match(mat, prevnl, nextnl); return true; } } } false } else { let e = match self.re.shortest_match(&buf[start..]) { None => return false, Some(e) => start + e, }; let (s, e) = self.find_line(buf, e, e); self.fill_match(mat, s, e); true } } fn fill_match(&self, mat: &mut Match, start: usize, end: usize) { mat.start = start; mat.end = end; } fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) { (self.find_line_start(buf, s), self.find_line_end(buf, e)) } fn find_line_start(&self, buf: &[u8], pos: usize) -> usize { memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1) } fn find_line_end(&self, buf: &[u8], pos: usize) -> usize { memchr(self.opts.line_terminator, &buf[pos..]) .map_or(buf.len(), |i| pos + i) } } pub struct Buffer { buf: Vec, tmp: Vec, } impl Buffer { pub fn new() -> Buffer { Buffer::with_capacity(16 * (1<<10)) } pub fn with_capacity(cap: usize) -> Buffer { Buffer { buf: vec![0; cap], tmp: Vec::new(), } } } pub struct GrepBuffered<'g, R> { grep: &'g Grep, rdr: R, b: Buffer, pos: usize, start: usize, lastnl: usize, end: usize, } impl<'g, R: io::Read> GrepBuffered<'g, R> { pub fn into_buffer(self) -> Buffer { self.b } pub fn iter<'b>(&'b mut self) -> IterBuffered<'b, 'g, R> { IterBuffered { grep: self } } pub fn read_match( &mut self, mat: &mut Match, ) -> Result { loop { // If the starting position is equal to the end of the last search, // then it's time to refill the buffer for more searching. if self.start == self.lastnl { if !try!(self.fill()) { return Ok(false); } } let ok = self.grep.read_match( mat, &self.b.buf[..self.lastnl], self.start); if !ok { // This causes the next iteration to refill the buffer with // more bytes to search. self.start = self.lastnl; continue; } // Move start to the first possible byte of the next line. self.start = cmp::min( self.lastnl, mat.end.checked_add(1).unwrap()); mat.start += self.pos; mat.end += self.pos; return Ok(true); } } fn fill(&mut self) -> Result { { // The buffer might have leftover bytes that have not been // searched yet. Leftovers correspond to all bytes proceding the // final \n in the current buffer. // // TODO(ag): Seems like we should be able to memmove from the end // of the buffer to the beginning, but let's do it the stupid (but // safe) way for now. let leftovers = &self.b.buf[self.lastnl..self.end]; self.b.tmp.clear(); self.b.tmp.resize(leftovers.len(), 0); self.b.tmp.copy_from_slice(leftovers); } // Move the leftovers to the beginning of our buffer. self.b.buf[0..self.b.tmp.len()].copy_from_slice(&self.b.tmp); // Fill the rest with fresh bytes. let nread = try!(self.rdr.read(&mut self.b.buf[self.b.tmp.len()..])); // Now update our position in all of the bytes searched. self.pos += self.start; self.start = 0; // The end is the total number of bytes read plus whatever we had for // leftovers. self.end = self.b.tmp.len() + nread; // Find the last new line. All searches on this buffer will be capped // at this position since any proceding bytes may correspond to a // partial line. // // This is a little complicated because we must handle the case where // the buffer is not full and no new line character could be found. // We detect this case because this could potentially be a partial // line. If we fill our buffer and still can't find a `\n`, then we // give up. let mut start = 0; let term = self.grep.opts.line_terminator; loop { match memrchr(term, &self.b.buf[start..self.end]) { Some(i) => { self.lastnl = start + i + 1; break; } None => { // If we couldn't find a new line and our buffer is // completely full, then this line is terribly long and we // return an error. if self.end == self.b.buf.len() { return Err(Error::LineTooLong(self.b.buf.len())); } // Otherwise we try to ask for more bytes and look again. let nread = try!( self.rdr.read(&mut self.b.buf[self.end..])); // If we got nothing then we're at EOF and we no longer // need to care about leftovers. if nread == 0 { self.lastnl = self.end; break; } start = self.end; self.end += nread; } } } // If end is zero, then we've hit EOF and we have no leftovers. Ok(self.end > 0) } } pub struct Iter<'b, 's> { searcher: &'s Grep, buf: &'b [u8], start: usize, } impl<'b, 's> Iterator for Iter<'b, 's> { type Item = Match; fn next(&mut self) -> Option { let mut mat = Match::default(); if !self.searcher.read_match(&mut mat, self.buf, self.start) { self.start = self.buf.len(); return None; } self.start = mat.end + 1; Some(mat) } } pub struct IterBuffered<'b, 'g: 'b, R: 'b> { grep: &'b mut GrepBuffered<'g, R>, } impl<'b, 'g, R: io::Read> Iterator for IterBuffered<'b, 'g, R> { type Item = Result; fn next(&mut self) -> Option> { let mut mat = Match::default(); match self.grep.read_match(&mut mat) { Err(err) => Some(Err(err)), Ok(false) => None, Ok(true) => Some(Ok(mat)), } } } #[allow(dead_code)] fn s(bytes: &[u8]) -> String { String::from_utf8(bytes.to_vec()).unwrap() } #[cfg(test)] mod tests { #![allow(unused_imports)] use memchr::{memchr, memrchr}; use regex::bytes::Regex; use super::{Buffer, GrepBuilder, s}; static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt"); fn find_lines(pat: &str, haystack: &[u8]) -> Vec<(usize, usize)> { let re = Regex::new(pat).unwrap(); let mut lines = vec![]; for (s, e) in re.find_iter(haystack) { let start = memrchr(b'\n', &haystack[..s]) .map_or(0, |i| i + 1); let end = memchr(b'\n', &haystack[e..]) .map_or(haystack.len(), |i| e + i); lines.push((start, end)); } lines } fn grep_lines(pat: &str, haystack: &[u8]) -> Vec<(usize, usize)> { let g = GrepBuilder::new(pat).create().unwrap(); let mut bg = g.buffered_reader(Buffer::new(), haystack); bg.iter().map(|r| r.unwrap()).map(|m| (m.start(), m.end())).collect() } #[test] fn buffered_literal() { let expected = find_lines("Sherlock Holmes", SHERLOCK); let got = grep_lines("Sherlock Holmes", SHERLOCK); assert_eq!(expected.len(), got.len()); assert_eq!(expected, got); } }