diff --git a/Cargo.toml b/Cargo.toml index 9ed5768..9e40938 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,7 +30,7 @@ rust-version = "1.85" [[bin]] bench = false path = "crates/core/main.rs" -name = "rg" +name = "rgs" [[test]] name = "integration" diff --git a/crates/core/flags/complete/rg.zsh b/crates/core/flags/complete/rg.zsh index ae9f119..2078c18 100644 --- a/crates/core/flags/complete/rg.zsh +++ b/crates/core/flags/complete/rg.zsh @@ -96,6 +96,8 @@ _rg() { + '(file-name)' # File-name options {-H,--with-filename}'[show file name for matches]' {-I,--no-filename}"[don't show file name for matches]" + '--in-file-index[show per-file match index in output]' + '--no-in-file-index[hide per-file match index in output]' + '(file-system)' # File system options "--one-file-system[don't descend into directories on other file systems]" @@ -210,6 +212,7 @@ _rg() { + '(multiline)' # Multiline options {-U,--multiline}'[permit matching across multiple lines]' + '--multiline-window=[limit multiline matches to NUM lines (with -U)]:number of lines' $no'(multiline-dotall)--no-multiline[restrict matches to at most one line each]' + '(multiline-dotall)' # Multiline DOTALL options diff --git a/crates/core/flags/defs.rs b/crates/core/flags/defs.rs index 5ce9108..9a31ec2 100644 --- a/crates/core/flags/defs.rs +++ b/crates/core/flags/defs.rs @@ -97,6 +97,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[ &MaxFilesize, &Mmap, &Multiline, + &MultilineWindow, &MultilineDotall, &NoConfig, &NoIgnore, @@ -142,6 +143,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[ &Unrestricted, &Version, &Vimgrep, + &InFileIndex, &WithFilename, &WithFilenameNo, &WordRegexp, @@ -4209,6 +4211,59 @@ fn test_multiline() { assert_eq!(false, args.multiline); } +/// --multiline-window +#[derive(Debug)] +struct MultilineWindow; + +impl Flag for MultilineWindow { + fn is_switch(&self) -> bool { + false + } + fn name_long(&self) -> &'static str { + "multiline-window" + } + fn doc_variable(&self) -> Option<&'static str> { + Some("NUM") + } + fn doc_category(&self) -> Category { + Category::Search + } + fn doc_short(&self) -> &'static str { + r"Limit multiline matches to a fixed number of lines." + } + fn doc_long(&self) -> &'static str { + r#" +Limit the maximum number of lines that a multiline match may span to +\fINUM\fP (use \fB--multiline-window=\fP\fINUM\fP). +.sp +This flag requires \flag{multiline}. Matches are found as if the file being +searched were limited to \fINUM\fP lines at a time, which can prevent +unintended long matches while still enabling multi-line searching. +.sp +The value of \fINUM\fP must be at least 1. +"# + } + + fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> { + let lines = convert::usize(&v.unwrap_value())?; + if lines == 0 { + anyhow::bail!("--multiline-window must be at least 1"); + } + args.multiline_window = Some(lines); + Ok(()) + } +} + +#[cfg(test)] +#[test] +fn test_multiline_window() { + let args = parse_low_raw(None::<&str>).unwrap(); + assert_eq!(None, args.multiline_window); + + let args = parse_low_raw(["--multiline-window=2"]).unwrap(); + assert_eq!(Some(2), args.multiline_window); +} + /// --multiline-dotall #[derive(Debug)] struct MultilineDotall; @@ -7401,6 +7456,53 @@ fn test_vimgrep() { assert_eq!(true, args.vimgrep); } +/// --in-file-index +#[derive(Debug)] +struct InFileIndex; + +impl Flag for InFileIndex { + fn is_switch(&self) -> bool { + true + } + fn name_long(&self) -> &'static str { + "in-file-index" + } + fn name_negated(&self) -> Option<&'static str> { + Some("no-in-file-index") + } + fn doc_category(&self) -> Category { + Category::Output + } + fn doc_short(&self) -> &'static str { + r"Prefix matches with an index per file." + } + fn doc_long(&self) -> &'static str { + r" +When enabled, ripgrep prefixes each matching line with an index that is +incremented per file. The format is \fIFILE\fP[\fIN\fP]:\fILINE\fP:, which can +disambiguate multi-line matches that print the same line multiple times. +" + } + + fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> { + args.in_file_index = v.unwrap_switch(); + Ok(()) + } +} + +#[cfg(test)] +#[test] +fn test_in_file_index() { + let args = parse_low_raw(None::<&str>).unwrap(); + assert_eq!(false, args.in_file_index); + + let args = parse_low_raw(["--in-file-index"]).unwrap(); + assert_eq!(true, args.in_file_index); + + let args = parse_low_raw(["--in-file-index", "--no-in-file-index"]).unwrap(); + assert_eq!(false, args.in_file_index); +} + /// --with-filename #[derive(Debug)] struct WithFilename; diff --git a/crates/core/flags/hiargs.rs b/crates/core/flags/hiargs.rs index 526c91e..eb74819 100644 --- a/crates/core/flags/hiargs.rs +++ b/crates/core/flags/hiargs.rs @@ -61,6 +61,7 @@ pub(crate) struct HiArgs { ignore_file_case_insensitive: bool, ignore_file: Vec, include_zero: bool, + in_file_index: bool, invert_match: bool, is_terminal_stdout: bool, line_number: bool, @@ -73,6 +74,7 @@ pub(crate) struct HiArgs { mode: Mode, multiline: bool, multiline_dotall: bool, + multiline_window: Option, no_ignore_dot: bool, no_ignore_exclude: bool, no_ignore_files: bool, @@ -140,6 +142,9 @@ impl HiArgs { } let mut state = State::new()?; + if low.multiline_window.is_some() && !low.multiline { + anyhow::bail!("--multiline-window requires --multiline"); + } let patterns = Patterns::from_low_args(&mut state, &mut low)?; let paths = Paths::from_low_args(&mut state, &patterns, &mut low)?; @@ -278,6 +283,7 @@ impl HiArgs { ignore_file: low.ignore_file, ignore_file_case_insensitive: low.ignore_file_case_insensitive, include_zero: low.include_zero, + in_file_index: low.in_file_index, invert_match: low.invert_match, is_terminal_stdout: state.is_terminal_stdout, line_number, @@ -289,6 +295,7 @@ impl HiArgs { mmap_choice, multiline: low.multiline, multiline_dotall: low.multiline_dotall, + multiline_window: low.multiline_window, no_ignore_dot: low.no_ignore_dot, no_ignore_exclude: low.no_ignore_exclude, no_ignore_files: low.no_ignore_files, @@ -616,6 +623,7 @@ impl HiArgs { .column(self.column) .heading(self.heading) .hyperlink(self.hyperlink_config.clone()) + .in_file_index(self.in_file_index) .max_columns_preview(self.max_columns_preview) .max_columns(self.max_columns) .only_matching(self.only_matching) @@ -723,6 +731,7 @@ impl HiArgs { .invert_match(self.invert_match) .line_number(self.line_number) .multi_line(self.multiline) + .multiline_window(self.multiline_window) .memory_map(self.mmap_choice.clone()) .stop_on_nonmatch(self.stop_on_nonmatch); match self.context { diff --git a/crates/core/flags/lowargs.rs b/crates/core/flags/lowargs.rs index 1941cae..596206b 100644 --- a/crates/core/flags/lowargs.rs +++ b/crates/core/flags/lowargs.rs @@ -65,6 +65,7 @@ pub(crate) struct LowArgs { pub(crate) ignore_file: Vec, pub(crate) ignore_file_case_insensitive: bool, pub(crate) include_zero: bool, + pub(crate) in_file_index: bool, pub(crate) invert_match: bool, pub(crate) line_number: Option, pub(crate) logging: Option, @@ -76,6 +77,7 @@ pub(crate) struct LowArgs { pub(crate) mmap: MmapMode, pub(crate) multiline: bool, pub(crate) multiline_dotall: bool, + pub(crate) multiline_window: Option, pub(crate) no_config: bool, pub(crate) no_ignore_dot: bool, pub(crate) no_ignore_exclude: bool, diff --git a/crates/printer/src/standard.rs b/crates/printer/src/standard.rs index 35c6e93..6ef590f 100644 --- a/crates/printer/src/standard.rs +++ b/crates/printer/src/standard.rs @@ -39,6 +39,7 @@ struct Config { stats: bool, heading: bool, path: bool, + in_file_index: bool, only_matching: bool, per_match: bool, per_match_one_line: bool, @@ -64,6 +65,7 @@ impl Default for Config { stats: false, heading: false, path: true, + in_file_index: false, only_matching: false, per_match: false, per_match_one_line: false, @@ -231,6 +233,12 @@ impl StandardBuilder { self } + /// When enabled, prefix matching lines with a per-file match index. + pub fn in_file_index(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.in_file_index = yes; + self + } + /// Only print the specific matches instead of the entire line containing /// each match. Each match is printed on its own line. When multi line /// search is enabled, then matches spanning multiple lines are printed @@ -528,6 +536,7 @@ impl Standard { path: None, start_time: Instant::now(), match_count: 0, + in_file_index: 0, binary_byte_offset: None, stats, needs_match_granularity, @@ -564,6 +573,7 @@ impl Standard { path: Some(ppath), start_time: Instant::now(), match_count: 0, + in_file_index: 0, binary_byte_offset: None, stats, needs_match_granularity, @@ -644,6 +654,7 @@ pub struct StandardSink<'p, 's, M: Matcher, W> { path: Option>, start_time: Instant, match_count: u64, + in_file_index: u64, binary_byte_offset: Option, stats: Option, needs_match_granularity: bool, @@ -769,6 +780,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { mat: &SinkMatch<'_>, ) -> Result { self.match_count += 1; + self.in_file_index += 1; self.record_matches( searcher, @@ -842,6 +854,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { self.standard.wtr.borrow_mut().reset_count(); self.start_time = Instant::now(); self.match_count = 0; + self.in_file_index = 0; self.binary_byte_offset = None; Ok(true) } @@ -956,6 +969,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset(), self.sunk.line_number(), None, + self.in_file_index(), )?; self.write_line(self.sunk.bytes()) } @@ -981,6 +995,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { absolute_byte_offset, self.sunk.line_number().map(|n| n + i as u64), None, + self.in_file_index(), )?; absolute_byte_offset += line.len() as u64; @@ -1001,6 +1016,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.line_number(), Some(m.start() as u64 + 1), + self.in_file_index(), )?; let buf = &self.sunk.bytes()[m]; @@ -1012,6 +1028,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.line_number(), Some(m.start() as u64 + 1), + self.in_file_index(), )?; self.write_colored_line(&[m], self.sunk.bytes())?; } @@ -1020,6 +1037,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset(), self.sunk.line_number(), Some(self.sunk.matches()[0].start() as u64 + 1), + self.in_file_index(), )?; self.write_colored_line(self.sunk.matches(), self.sunk.bytes())?; } @@ -1048,6 +1066,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset() + line.start() as u64, self.sunk.line_number().map(|n| n + count), Some(matches[0].start() as u64 + 1), + self.in_file_index(), )?; count += 1; self.trim_ascii_prefix(bytes, &mut line); @@ -1093,6 +1112,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.line_number().map(|n| n + count), Some(m.start() as u64 + 1), + self.in_file_index(), )?; let this_line = line.with_end(upto); @@ -1131,6 +1151,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.absolute_byte_offset() + line.start() as u64, self.sunk.line_number().map(|n| n + count), Some(m.start().saturating_sub(line.start()) as u64 + 1), + self.in_file_index(), )?; count += 1; self.trim_line_terminator(bytes, &mut line); @@ -1178,10 +1199,11 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { absolute_byte_offset: u64, line_number: Option, column: Option, + in_file_index: Option, ) -> io::Result<()> { let mut prelude = PreludeWriter::new(self); prelude.start(line_number, column)?; - prelude.write_path()?; + prelude.write_path(in_file_index)?; prelude.write_line_number(line_number)?; prelude.write_column_number(column)?; prelude.write_byte_offset(absolute_byte_offset)?; @@ -1532,6 +1554,14 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { self.sunk.context_kind().is_some() } + fn in_file_index(&self) -> Option { + if self.is_context() || !self.config().in_file_index { + None + } else { + Some(self.sink.in_file_index) + } + } + /// Return the underlying configuration for this printer. fn config(&self) -> &'a Config { &self.sink.standard.config @@ -1657,16 +1687,27 @@ impl<'a, M: Matcher, W: WriteColor> PreludeWriter<'a, M, W> { /// separator. (If a path terminator is set, then that is used instead of /// the field separator.) #[inline(always)] - fn write_path(&mut self) -> io::Result<()> { + fn write_path(&mut self, in_file_index: Option) -> io::Result<()> { // The prelude doesn't handle headings, only what comes before a match // on the same line. So if we are emitting paths in headings, we should // not do it here on each line. - if self.config().heading { + if self.config().heading && in_file_index.is_none() { + return Ok(()); + } + let path = self.std.path(); + if path.is_none() && in_file_index.is_none() { return Ok(()); } - let Some(path) = self.std.path() else { return Ok(()) }; self.write_separator()?; - self.std.write_path(path)?; + if let Some(path) = path { + self.std.write_path(path)?; + } + if let Some(index) = in_file_index { + self.std.write_spec(self.config().colors.path(), b"[")?; + let n = DecimalFormatter::new(index); + self.std.write_spec(self.config().colors.path(), n.as_bytes())?; + self.std.write_spec(self.config().colors.path(), b"]")?; + } self.next_separator = if self.config().path_terminator.is_some() { PreludeSeparator::PathTerminator diff --git a/crates/searcher/src/searcher/core.rs b/crates/searcher/src/searcher/core.rs index e0693c0..09eeb77 100644 --- a/crates/searcher/src/searcher/core.rs +++ b/crates/searcher/src/searcher/core.rs @@ -212,6 +212,18 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { consumed } + pub(crate) fn advance_buffer(&mut self, buf: &[u8], consumed: usize) { + if consumed == 0 { + return; + } + self.count_lines(buf, consumed); + self.absolute_byte_offset += consumed as u64; + self.last_line_counted = 0; + self.last_line_visited = + self.last_line_visited.saturating_sub(consumed); + self.set_pos(self.pos().saturating_sub(consumed)); + } + pub(crate) fn detect_binary( &mut self, buf: &[u8], diff --git a/crates/searcher/src/searcher/glue.rs b/crates/searcher/src/searcher/glue.rs index defb9c4..45f2dcd 100644 --- a/crates/searcher/src/searcher/glue.rs +++ b/crates/searcher/src/searcher/glue.rs @@ -1,7 +1,9 @@ use grep_matcher::Matcher; +use std::collections::VecDeque; + use crate::{ - line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader}, + line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader, alloc_error}, lines::{self, LineStep}, searcher::{Config, Range, Searcher, core::Core}, sink::{Sink, SinkError}, @@ -138,6 +140,348 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> { } } +#[derive(Debug)] +pub(crate) struct WindowedMultiLine<'s, M, S> { + config: &'s Config, + core: Core<'s, M, S>, + window_lines: usize, + buf: Vec, + buf_start: usize, + line_lens: VecDeque, + abs_start: u64, + current_index: usize, + eof: bool, +} + +impl<'s, M: Matcher, S: Sink> WindowedMultiLine<'s, M, S> { + pub(crate) fn new( + searcher: &'s Searcher, + matcher: M, + window_lines: usize, + write_to: S, + ) -> WindowedMultiLine<'s, M, S> { + debug_assert!(searcher.multi_line_with_matcher(&matcher)); + debug_assert!(window_lines > 0); + + WindowedMultiLine { + config: &searcher.config, + core: Core::new(searcher, matcher, write_to, true), + window_lines, + buf: Vec::new(), + buf_start: 0, + line_lens: VecDeque::new(), + abs_start: 0, + current_index: 0, + eof: false, + } + } + + pub(crate) fn run_reader( + mut self, + mut rdr: LineBufferReader<'s, R>, + ) -> Result<(), S::Error> { + if self.core.begin()? { + let mut already_binary = rdr.binary_byte_offset().is_some(); + while self.fill_reader(&mut rdr, &mut already_binary)? + || !self.line_lens.is_empty() + { + if !self.process_current_line()? { + break; + } + } + } + let byte_count = self.byte_count(); + let binary_byte_offset = self.core.binary_byte_offset(); + self.core.finish(byte_count, binary_byte_offset) + } + + pub(crate) fn run_slice(mut self, slice: &'s [u8]) -> Result<(), S::Error> { + if self.core.begin()? { + let binary_upto = + std::cmp::min(slice.len(), DEFAULT_BUFFER_CAPACITY); + let binary_range = Range::new(0, binary_upto); + if !self.core.detect_binary(slice, &binary_range)? { + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + 0, + slice.len(), + ); + while let Some(line) = stepper.next_match(slice) { + self.push_line(&slice[line])?; + } + self.eof = true; + while !self.line_lens.is_empty() { + if !self.process_current_line()? { + break; + } + } + } + } + let byte_count = self.byte_count(); + let binary_byte_offset = self.core.binary_byte_offset(); + self.core.finish(byte_count, binary_byte_offset) + } + + fn fill_reader( + &mut self, + rdr: &mut LineBufferReader<'s, R>, + already_binary: &mut bool, + ) -> Result { + while !self.eof + && self.line_lens.len() < self.current_index + self.window_lines + { + let didread = match rdr.fill() { + Err(err) => return Err(S::Error::error_io(err)), + Ok(didread) => didread, + }; + if !*already_binary { + if let Some(offset) = rdr.binary_byte_offset() { + *already_binary = true; + if !self.core.binary_data(offset)? { + self.eof = true; + return Ok(false); + } + } + } + if !didread { + self.eof = true; + break; + } + let buf = rdr.buffer(); + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + 0, + buf.len(), + ); + while let Some(line) = stepper.next_match(buf) { + let bytes = &buf[line]; + self.push_line(bytes)?; + } + rdr.consume(buf.len()); + } + Ok(!self.eof) + } + + fn push_line(&mut self, line: &[u8]) -> Result<(), S::Error> { + self.buf.extend_from_slice(line); + self.line_lens.push_back(line.len()); + if let Some(limit) = self.config.heap_limit { + let used = self.buf.len() - self.buf_start; + if used > limit { + return Err(S::Error::error_io(alloc_error(limit))); + } + } + Ok(()) + } + + fn process_current_line(&mut self) -> Result { + if self.current_index >= self.line_lens.len() { + return Ok(false); + } + let window_end = + std::cmp::min(self.line_lens.len(), self.current_index + self.window_lines); + let window_start_off = self.line_offset(self.current_index); + let window_end_off = self.line_offset(window_end); + let line0_len = self.line_lens[self.current_index]; + + { + let buffer = &self.buf[self.buf_start..]; + let window_bytes = + &self.buf[self.buf_start + window_start_off + ..self.buf_start + window_end_off]; + if self.config.invert_match { + if !sink_inverted_line( + &mut self.core, + self.config, + buffer, + window_bytes, + window_start_off, + line0_len, + )? { + return Ok(false); + } + } else if !sink_matched_line( + &mut self.core, + self.config, + buffer, + window_bytes, + window_start_off, + line0_len, + )? { + return Ok(false); + } + + let drop_upto = window_start_off + line0_len; + if self.config.passthru { + if !self.core.other_context_by_line(buffer, drop_upto)? { + return Ok(false); + } + } else if !self.core.after_context_by_line(buffer, drop_upto)? { + return Ok(false); + } + } + + self.current_index += 1; + if self.current_index > self.config.before_context { + let drop_len = self.line_lens.pop_front().unwrap(); + self.shift_buffer(drop_len); + self.current_index -= 1; + } + + if self.eof && self.current_index >= self.line_lens.len() { + let buffer = &self.buf[self.buf_start..]; + if self.config.passthru { + if !self.core.other_context_by_line(buffer, buffer.len())? { + return Ok(false); + } + } else if !self.core.after_context_by_line(buffer, buffer.len())? { + return Ok(false); + } + return Ok(false); + } + Ok(true) + } + + fn line_offset(&self, idx: usize) -> usize { + self.line_lens.iter().take(idx).sum() + } + + fn shift_buffer(&mut self, consumed: usize) { + let buffer = &self.buf[self.buf_start..]; + self.core.advance_buffer(buffer, consumed); + self.buf_start += consumed; + self.abs_start += consumed as u64; + if self.buf_start > 0 && self.buf_start > self.buf.len() / 2 { + self.buf.copy_within(self.buf_start.., 0); + let new_len = self.buf.len() - self.buf_start; + self.buf.truncate(new_len); + self.buf_start = 0; + } + } + + fn byte_count(&mut self) -> u64 { + match self.core.binary_byte_offset() { + Some(offset) if offset < self.core.pos() as u64 => offset, + _ => self.abs_start + (self.buf.len() - self.buf_start) as u64, + } + } +} + +fn sink_matched_line( + core: &mut Core<'_, M, S>, + config: &Config, + buffer: &[u8], + window_bytes: &[u8], + window_start_off: usize, + line0_len: usize, +) -> Result { + let mut pos = 0; + let mut last_match: Option = None; + while let Some(mat) = find_in_window(core, window_bytes, pos)? { + if mat.start() >= line0_len { + break; + } + let line = lines::locate( + window_bytes, + config.line_term.as_byte(), + mat, + ) + .offset(window_start_off); + match last_match.take() { + None => { + last_match = Some(line); + } + Some(last) => { + if last.end() >= line.start() { + last_match = Some(last.with_end(line.end())); + } else { + if !sink_context(core, config, buffer, &last)? { + return Ok(false); + } + if !core.matched(buffer, &last)? { + return Ok(false); + } + last_match = Some(line); + } + } + } + pos = mat.end(); + if mat.is_empty() && pos < window_bytes.len() { + pos += 1; + } + } + if let Some(last) = last_match.take() { + if !sink_context(core, config, buffer, &last)? { + return Ok(false); + } + if !core.matched(buffer, &last)? { + return Ok(false); + } + } + Ok(true) +} + +fn sink_inverted_line( + core: &mut Core<'_, M, S>, + config: &Config, + buffer: &[u8], + window_bytes: &[u8], + window_start_off: usize, + line0_len: usize, +) -> Result { + let mut pos = 0; + while let Some(mat) = find_in_window(core, window_bytes, pos)? { + if mat.start() >= line0_len { + break; + } + if mat.start() < line0_len { + return Ok(true); + } + pos = mat.end(); + if mat.is_empty() && pos < window_bytes.len() { + pos += 1; + } + } + let line = Range::new(window_start_off, window_start_off + line0_len); + if !sink_context(core, config, buffer, &line)? { + return Ok(false); + } + if !core.matched(buffer, &line)? { + return Ok(false); + } + Ok(true) +} + +fn find_in_window( + core: &mut Core<'_, M, S>, + window_bytes: &[u8], + pos: usize, +) -> Result, S::Error> { + core.find(&window_bytes[pos..]) + .map(|m| m.map(|m| m.offset(pos))) +} + +fn sink_context( + core: &mut Core<'_, M, S>, + config: &Config, + buffer: &[u8], + range: &Range, +) -> Result { + if config.passthru { + if !core.other_context_by_line(buffer, range.start())? { + return Ok(false); + } + } else { + if !core.after_context_by_line(buffer, range.start())? { + return Ok(false); + } + if !core.before_context_by_line(buffer, range.start())? { + return Ok(false); + } + } + Ok(true) +} + #[derive(Debug)] pub(crate) struct MultiLine<'s, M, S> { config: &'s Config, @@ -518,6 +862,37 @@ byte count:366 .test(); } + #[test] + fn multi_line_window_limits_match() { + let haystack = "a\nb\nc\nd\n"; + let matcher = RegexMatcher::new("a\nb\nc"); + + let mut builder = SearcherBuilder::new(); + builder.multi_line(true).multiline_window(Some(2)).line_number(false); + let mut sink = KitchenSink::new(); + let mut searcher = builder.build(); + searcher + .search_slice(&matcher, haystack.as_bytes(), &mut sink) + .unwrap(); + let got = String::from_utf8(sink.as_bytes().to_vec()).unwrap(); + let exp = format!("\nbyte count:{}\n", haystack.len()); + assert_eq!(exp, got); + + let mut builder = SearcherBuilder::new(); + builder.multi_line(true).multiline_window(Some(3)).line_number(false); + let mut sink = KitchenSink::new(); + let mut searcher = builder.build(); + searcher + .search_slice(&matcher, haystack.as_bytes(), &mut sink) + .unwrap(); + let exp = format!( + "0:a\n2:b\n4:c\n\nbyte count:{}\n", + haystack.len() + ); + let got = String::from_utf8(sink.as_bytes().to_vec()).unwrap(); + assert_eq!(exp, got); + } + #[test] fn multi_line_overlap2() { let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx"; diff --git a/crates/searcher/src/searcher/mod.rs b/crates/searcher/src/searcher/mod.rs index 30fbd83..feb4c99 100644 --- a/crates/searcher/src/searcher/mod.rs +++ b/crates/searcher/src/searcher/mod.rs @@ -16,7 +16,7 @@ use crate::{ self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer, LineBufferBuilder, LineBufferReader, alloc_error, }, - searcher::glue::{MultiLine, ReadByLine, SliceByLine}, + searcher::glue::{MultiLine, ReadByLine, SliceByLine, WindowedMultiLine}, sink::{Sink, SinkError}, }; @@ -172,6 +172,8 @@ pub struct Config { binary: BinaryDetection, /// Whether to enable matching across multiple lines. multi_line: bool, + /// The maximum number of lines a multi-line match may span. + multiline_window: Option, /// An encoding that, when present, causes the searcher to transcode all /// input from the encoding to UTF-8. encoding: Option, @@ -197,6 +199,7 @@ impl Default for Config { mmap: MmapChoice::default(), binary: BinaryDetection::default(), multi_line: false, + multiline_window: None, encoding: None, bom_sniffing: true, stop_on_nonmatch: false, @@ -390,6 +393,15 @@ impl SearcherBuilder { self } + /// Limit multi-line matches to a window of at most `line_count` lines. + pub fn multiline_window( + &mut self, + line_count: Option, + ) -> &mut SearcherBuilder { + self.config.multiline_window = line_count; + self + } + /// Whether to include a fixed number of lines after every match. /// /// When this is set to a non-zero number, then the searcher will report @@ -694,6 +706,13 @@ impl Searcher { // enabled. This pre-allocates a buffer roughly the size of the file, // which isn't possible when searching an arbitrary std::io::Read. if self.multi_line_with_matcher(&matcher) { + if self.config.multiline_window.is_some() { + log::trace!( + "{:?}: searching via windowed multiline strategy", + path + ); + return self.search_reader(matcher, file, write_to); + } log::trace!( "{:?}: reading entire file on to heap for mulitline", path @@ -744,6 +763,18 @@ impl Searcher { .map_err(S::Error::error_io)?; if self.multi_line_with_matcher(&matcher) { + if let Some(window_lines) = self.config.multiline_window { + let mut line_buffer = self.line_buffer.borrow_mut(); + let rdr = LineBufferReader::new(decoder, &mut *line_buffer); + log::trace!("generic reader: searching via windowed multiline"); + return WindowedMultiLine::new( + self, + matcher, + window_lines, + write_to, + ) + .run_reader(rdr); + } log::trace!( "generic reader: reading everything to heap for multiline" ); @@ -786,6 +817,16 @@ impl Searcher { return self.search_reader(matcher, slice, write_to); } if self.multi_line_with_matcher(&matcher) { + if let Some(window_lines) = self.config.multiline_window { + log::trace!("slice reader: searching via windowed multiline"); + return WindowedMultiLine::new( + self, + matcher, + window_lines, + write_to, + ) + .run_slice(slice); + } log::trace!("slice reader: searching via multiline strategy"); MultiLine::new(self, matcher, slice, write_to).run() } else { @@ -865,6 +906,12 @@ impl Searcher { self.config.multi_line } + /// Returns the maximum number of lines a multi-line match may span. + #[inline] + pub fn multiline_window(&self) -> Option { + self.config.multiline_window + } + /// Returns true if and only if this searcher is configured to stop when it /// finds a non-matching line after a matching one. #[inline]