rgs: added multiline window limit and in-file result indexing (work in progress)

This commit is contained in:
2025-12-23 04:01:55 -05:00
parent cd1f981bea
commit ad6ec1b4c5
9 changed files with 599 additions and 8 deletions

View File

@@ -30,7 +30,7 @@ rust-version = "1.85"
[[bin]] [[bin]]
bench = false bench = false
path = "crates/core/main.rs" path = "crates/core/main.rs"
name = "rg" name = "rgs"
[[test]] [[test]]
name = "integration" name = "integration"

View File

@@ -96,6 +96,8 @@ _rg() {
+ '(file-name)' # File-name options + '(file-name)' # File-name options
{-H,--with-filename}'[show file name for matches]' {-H,--with-filename}'[show file name for matches]'
{-I,--no-filename}"[don't show file name for matches]" {-I,--no-filename}"[don't show file name for matches]"
'--in-file-index[show per-file match index in output]'
'--no-in-file-index[hide per-file match index in output]'
+ '(file-system)' # File system options + '(file-system)' # File system options
"--one-file-system[don't descend into directories on other file systems]" "--one-file-system[don't descend into directories on other file systems]"
@@ -210,6 +212,7 @@ _rg() {
+ '(multiline)' # Multiline options + '(multiline)' # Multiline options
{-U,--multiline}'[permit matching across multiple lines]' {-U,--multiline}'[permit matching across multiple lines]'
'--multiline-window=[limit multiline matches to NUM lines (with -U)]:number of lines'
$no'(multiline-dotall)--no-multiline[restrict matches to at most one line each]' $no'(multiline-dotall)--no-multiline[restrict matches to at most one line each]'
+ '(multiline-dotall)' # Multiline DOTALL options + '(multiline-dotall)' # Multiline DOTALL options

View File

@@ -97,6 +97,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[
&MaxFilesize, &MaxFilesize,
&Mmap, &Mmap,
&Multiline, &Multiline,
&MultilineWindow,
&MultilineDotall, &MultilineDotall,
&NoConfig, &NoConfig,
&NoIgnore, &NoIgnore,
@@ -142,6 +143,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[
&Unrestricted, &Unrestricted,
&Version, &Version,
&Vimgrep, &Vimgrep,
&InFileIndex,
&WithFilename, &WithFilename,
&WithFilenameNo, &WithFilenameNo,
&WordRegexp, &WordRegexp,
@@ -4209,6 +4211,59 @@ fn test_multiline() {
assert_eq!(false, args.multiline); assert_eq!(false, args.multiline);
} }
/// --multiline-window
///
/// Value flag that stores an upper bound on the number of lines a single
/// multiline match may span in `LowArgs::multiline_window`.
#[derive(Debug)]
struct MultilineWindow;

impl Flag for MultilineWindow {
    // This flag always carries a value, so it is never a switch.
    fn is_switch(&self) -> bool {
        false
    }
    fn name_long(&self) -> &'static str {
        "multiline-window"
    }
    fn doc_variable(&self) -> Option<&'static str> {
        Some("NUM")
    }
    fn doc_category(&self) -> Category {
        Category::Search
    }
    fn doc_short(&self) -> &'static str {
        r"Limit multiline matches to a fixed number of lines."
    }
    fn doc_long(&self) -> &'static str {
        r#"
Limit the maximum number of lines that a multiline match may span to
\fINUM\fP (use \fB--multiline-window=\fP\fINUM\fP).
.sp
This flag requires \flag{multiline}. Matches are found as if the file being
searched were limited to \fINUM\fP lines at a time, which can prevent
unintended long matches while still enabling multi-line searching.
.sp
The value of \fINUM\fP must be at least 1.
"#
    }
    /// Parses the flag value as a `usize` and records it.
    ///
    /// Returns an error when the value does not parse or when it is zero.
    fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> {
        let raw = v.unwrap_value();
        let window = convert::usize(&raw)?;
        // A window of zero lines could never contain a match.
        anyhow::ensure!(window > 0, "--multiline-window must be at least 1");
        args.multiline_window = Some(window);
        Ok(())
    }
}
#[cfg(test)]
#[test]
fn test_multiline_window() {
    // The window is unset unless the flag is given.
    let args = parse_low_raw(None::<&str>).unwrap();
    assert_eq!(None, args.multiline_window);

    let args = parse_low_raw(["--multiline-window=2"]).unwrap();
    assert_eq!(Some(2), args.multiline_window);

    // The value may also be passed as a separate argument.
    let args = parse_low_raw(["--multiline-window", "2"]).unwrap();
    assert_eq!(Some(2), args.multiline_window);

    // Each occurrence overwrites the previous one, so the last value wins.
    let args =
        parse_low_raw(["--multiline-window=2", "--multiline-window=5"])
            .unwrap();
    assert_eq!(Some(5), args.multiline_window);

    // A zero-line window is rejected by the flag's validation.
    assert!(parse_low_raw(["--multiline-window=0"]).is_err());
}
/// --multiline-dotall /// --multiline-dotall
#[derive(Debug)] #[derive(Debug)]
struct MultilineDotall; struct MultilineDotall;
@@ -7401,6 +7456,53 @@ fn test_vimgrep() {
assert_eq!(true, args.vimgrep); assert_eq!(true, args.vimgrep);
} }
/// --in-file-index
///
/// Switch (negatable via `--no-in-file-index`) that asks the printer to tag
/// every matching line with the number of the match within its file.
#[derive(Debug)]
struct InFileIndex;

impl Flag for InFileIndex {
    fn is_switch(&self) -> bool {
        true
    }
    fn name_long(&self) -> &'static str {
        "in-file-index"
    }
    fn name_negated(&self) -> Option<&'static str> {
        Some("no-in-file-index")
    }
    fn doc_category(&self) -> Category {
        Category::Output
    }
    fn doc_short(&self) -> &'static str {
        r"Prefix matches with a per-file match index."
    }
    // The previous wording said the index was "incremented per file". The
    // printer actually increments it once per reported match and resets it
    // when a new file begins, so the help text now says exactly that.
    fn doc_long(&self) -> &'static str {
        r"
When enabled, ripgrep prefixes each matching line with the index of the match
it belongs to. The index counts the matches reported in the current file: it
is 1 for the first match, is incremented for every following match and starts
over for each file searched. Context lines are not prefixed. The format is
\fIFILE\fP[\fIN\fP]:\fILINE\fP:, which can disambiguate multi-line matches
that print the same line multiple times.
"
    }
    /// Stores the switch state; the negated form stores `false`.
    fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> {
        args.in_file_index = v.unwrap_switch();
        Ok(())
    }
}
#[cfg(test)]
#[test]
fn test_in_file_index() {
    // Off by default.
    let defaults = parse_low_raw(None::<&str>).unwrap();
    assert_eq!(false, defaults.in_file_index);

    // The positive form switches it on.
    let enabled = parse_low_raw(["--in-file-index"]).unwrap();
    assert_eq!(true, enabled.in_file_index);

    // A later negation overrides an earlier positive flag.
    let negated =
        parse_low_raw(["--in-file-index", "--no-in-file-index"]).unwrap();
    assert_eq!(false, negated.in_file_index);
}
/// --with-filename /// --with-filename
#[derive(Debug)] #[derive(Debug)]
struct WithFilename; struct WithFilename;

View File

@@ -61,6 +61,7 @@ pub(crate) struct HiArgs {
ignore_file_case_insensitive: bool, ignore_file_case_insensitive: bool,
ignore_file: Vec<PathBuf>, ignore_file: Vec<PathBuf>,
include_zero: bool, include_zero: bool,
in_file_index: bool,
invert_match: bool, invert_match: bool,
is_terminal_stdout: bool, is_terminal_stdout: bool,
line_number: bool, line_number: bool,
@@ -73,6 +74,7 @@ pub(crate) struct HiArgs {
mode: Mode, mode: Mode,
multiline: bool, multiline: bool,
multiline_dotall: bool, multiline_dotall: bool,
multiline_window: Option<usize>,
no_ignore_dot: bool, no_ignore_dot: bool,
no_ignore_exclude: bool, no_ignore_exclude: bool,
no_ignore_files: bool, no_ignore_files: bool,
@@ -140,6 +142,9 @@ impl HiArgs {
} }
let mut state = State::new()?; let mut state = State::new()?;
if low.multiline_window.is_some() && !low.multiline {
anyhow::bail!("--multiline-window requires --multiline");
}
let patterns = Patterns::from_low_args(&mut state, &mut low)?; let patterns = Patterns::from_low_args(&mut state, &mut low)?;
let paths = Paths::from_low_args(&mut state, &patterns, &mut low)?; let paths = Paths::from_low_args(&mut state, &patterns, &mut low)?;
@@ -278,6 +283,7 @@ impl HiArgs {
ignore_file: low.ignore_file, ignore_file: low.ignore_file,
ignore_file_case_insensitive: low.ignore_file_case_insensitive, ignore_file_case_insensitive: low.ignore_file_case_insensitive,
include_zero: low.include_zero, include_zero: low.include_zero,
in_file_index: low.in_file_index,
invert_match: low.invert_match, invert_match: low.invert_match,
is_terminal_stdout: state.is_terminal_stdout, is_terminal_stdout: state.is_terminal_stdout,
line_number, line_number,
@@ -289,6 +295,7 @@ impl HiArgs {
mmap_choice, mmap_choice,
multiline: low.multiline, multiline: low.multiline,
multiline_dotall: low.multiline_dotall, multiline_dotall: low.multiline_dotall,
multiline_window: low.multiline_window,
no_ignore_dot: low.no_ignore_dot, no_ignore_dot: low.no_ignore_dot,
no_ignore_exclude: low.no_ignore_exclude, no_ignore_exclude: low.no_ignore_exclude,
no_ignore_files: low.no_ignore_files, no_ignore_files: low.no_ignore_files,
@@ -616,6 +623,7 @@ impl HiArgs {
.column(self.column) .column(self.column)
.heading(self.heading) .heading(self.heading)
.hyperlink(self.hyperlink_config.clone()) .hyperlink(self.hyperlink_config.clone())
.in_file_index(self.in_file_index)
.max_columns_preview(self.max_columns_preview) .max_columns_preview(self.max_columns_preview)
.max_columns(self.max_columns) .max_columns(self.max_columns)
.only_matching(self.only_matching) .only_matching(self.only_matching)
@@ -723,6 +731,7 @@ impl HiArgs {
.invert_match(self.invert_match) .invert_match(self.invert_match)
.line_number(self.line_number) .line_number(self.line_number)
.multi_line(self.multiline) .multi_line(self.multiline)
.multiline_window(self.multiline_window)
.memory_map(self.mmap_choice.clone()) .memory_map(self.mmap_choice.clone())
.stop_on_nonmatch(self.stop_on_nonmatch); .stop_on_nonmatch(self.stop_on_nonmatch);
match self.context { match self.context {

View File

@@ -65,6 +65,7 @@ pub(crate) struct LowArgs {
pub(crate) ignore_file: Vec<PathBuf>, pub(crate) ignore_file: Vec<PathBuf>,
pub(crate) ignore_file_case_insensitive: bool, pub(crate) ignore_file_case_insensitive: bool,
pub(crate) include_zero: bool, pub(crate) include_zero: bool,
pub(crate) in_file_index: bool,
pub(crate) invert_match: bool, pub(crate) invert_match: bool,
pub(crate) line_number: Option<bool>, pub(crate) line_number: Option<bool>,
pub(crate) logging: Option<LoggingMode>, pub(crate) logging: Option<LoggingMode>,
@@ -76,6 +77,7 @@ pub(crate) struct LowArgs {
pub(crate) mmap: MmapMode, pub(crate) mmap: MmapMode,
pub(crate) multiline: bool, pub(crate) multiline: bool,
pub(crate) multiline_dotall: bool, pub(crate) multiline_dotall: bool,
pub(crate) multiline_window: Option<usize>,
pub(crate) no_config: bool, pub(crate) no_config: bool,
pub(crate) no_ignore_dot: bool, pub(crate) no_ignore_dot: bool,
pub(crate) no_ignore_exclude: bool, pub(crate) no_ignore_exclude: bool,

View File

@@ -39,6 +39,7 @@ struct Config {
stats: bool, stats: bool,
heading: bool, heading: bool,
path: bool, path: bool,
in_file_index: bool,
only_matching: bool, only_matching: bool,
per_match: bool, per_match: bool,
per_match_one_line: bool, per_match_one_line: bool,
@@ -64,6 +65,7 @@ impl Default for Config {
stats: false, stats: false,
heading: false, heading: false,
path: true, path: true,
in_file_index: false,
only_matching: false, only_matching: false,
per_match: false, per_match: false,
per_match_one_line: false, per_match_one_line: false,
@@ -231,6 +233,12 @@ impl StandardBuilder {
self self
} }
/// Controls whether matching lines are prefixed with a per-file match
/// index.
///
/// This is disabled by default.
pub fn in_file_index(&mut self, yes: bool) -> &mut StandardBuilder {
    let config = &mut self.config;
    config.in_file_index = yes;
    self
}
/// Only print the specific matches instead of the entire line containing /// Only print the specific matches instead of the entire line containing
/// each match. Each match is printed on its own line. When multi line /// each match. Each match is printed on its own line. When multi line
/// search is enabled, then matches spanning multiple lines are printed /// search is enabled, then matches spanning multiple lines are printed
@@ -528,6 +536,7 @@ impl<W: WriteColor> Standard<W> {
path: None, path: None,
start_time: Instant::now(), start_time: Instant::now(),
match_count: 0, match_count: 0,
in_file_index: 0,
binary_byte_offset: None, binary_byte_offset: None,
stats, stats,
needs_match_granularity, needs_match_granularity,
@@ -564,6 +573,7 @@ impl<W: WriteColor> Standard<W> {
path: Some(ppath), path: Some(ppath),
start_time: Instant::now(), start_time: Instant::now(),
match_count: 0, match_count: 0,
in_file_index: 0,
binary_byte_offset: None, binary_byte_offset: None,
stats, stats,
needs_match_granularity, needs_match_granularity,
@@ -644,6 +654,7 @@ pub struct StandardSink<'p, 's, M: Matcher, W> {
path: Option<PrinterPath<'p>>, path: Option<PrinterPath<'p>>,
start_time: Instant, start_time: Instant,
match_count: u64, match_count: u64,
in_file_index: u64,
binary_byte_offset: Option<u64>, binary_byte_offset: Option<u64>,
stats: Option<Stats>, stats: Option<Stats>,
needs_match_granularity: bool, needs_match_granularity: bool,
@@ -769,6 +780,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> {
mat: &SinkMatch<'_>, mat: &SinkMatch<'_>,
) -> Result<bool, io::Error> { ) -> Result<bool, io::Error> {
self.match_count += 1; self.match_count += 1;
self.in_file_index += 1;
self.record_matches( self.record_matches(
searcher, searcher,
@@ -842,6 +854,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> {
self.standard.wtr.borrow_mut().reset_count(); self.standard.wtr.borrow_mut().reset_count();
self.start_time = Instant::now(); self.start_time = Instant::now();
self.match_count = 0; self.match_count = 0;
self.in_file_index = 0;
self.binary_byte_offset = None; self.binary_byte_offset = None;
Ok(true) Ok(true)
} }
@@ -956,6 +969,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset(), self.sunk.absolute_byte_offset(),
self.sunk.line_number(), self.sunk.line_number(),
None, None,
self.in_file_index(),
)?; )?;
self.write_line(self.sunk.bytes()) self.write_line(self.sunk.bytes())
} }
@@ -981,6 +995,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
absolute_byte_offset, absolute_byte_offset,
self.sunk.line_number().map(|n| n + i as u64), self.sunk.line_number().map(|n| n + i as u64),
None, None,
self.in_file_index(),
)?; )?;
absolute_byte_offset += line.len() as u64; absolute_byte_offset += line.len() as u64;
@@ -1001,6 +1016,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.absolute_byte_offset() + m.start() as u64,
self.sunk.line_number(), self.sunk.line_number(),
Some(m.start() as u64 + 1), Some(m.start() as u64 + 1),
self.in_file_index(),
)?; )?;
let buf = &self.sunk.bytes()[m]; let buf = &self.sunk.bytes()[m];
@@ -1012,6 +1028,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.absolute_byte_offset() + m.start() as u64,
self.sunk.line_number(), self.sunk.line_number(),
Some(m.start() as u64 + 1), Some(m.start() as u64 + 1),
self.in_file_index(),
)?; )?;
self.write_colored_line(&[m], self.sunk.bytes())?; self.write_colored_line(&[m], self.sunk.bytes())?;
} }
@@ -1020,6 +1037,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset(), self.sunk.absolute_byte_offset(),
self.sunk.line_number(), self.sunk.line_number(),
Some(self.sunk.matches()[0].start() as u64 + 1), Some(self.sunk.matches()[0].start() as u64 + 1),
self.in_file_index(),
)?; )?;
self.write_colored_line(self.sunk.matches(), self.sunk.bytes())?; self.write_colored_line(self.sunk.matches(), self.sunk.bytes())?;
} }
@@ -1048,6 +1066,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset() + line.start() as u64, self.sunk.absolute_byte_offset() + line.start() as u64,
self.sunk.line_number().map(|n| n + count), self.sunk.line_number().map(|n| n + count),
Some(matches[0].start() as u64 + 1), Some(matches[0].start() as u64 + 1),
self.in_file_index(),
)?; )?;
count += 1; count += 1;
self.trim_ascii_prefix(bytes, &mut line); self.trim_ascii_prefix(bytes, &mut line);
@@ -1093,6 +1112,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset() + m.start() as u64, self.sunk.absolute_byte_offset() + m.start() as u64,
self.sunk.line_number().map(|n| n + count), self.sunk.line_number().map(|n| n + count),
Some(m.start() as u64 + 1), Some(m.start() as u64 + 1),
self.in_file_index(),
)?; )?;
let this_line = line.with_end(upto); let this_line = line.with_end(upto);
@@ -1131,6 +1151,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.absolute_byte_offset() + line.start() as u64, self.sunk.absolute_byte_offset() + line.start() as u64,
self.sunk.line_number().map(|n| n + count), self.sunk.line_number().map(|n| n + count),
Some(m.start().saturating_sub(line.start()) as u64 + 1), Some(m.start().saturating_sub(line.start()) as u64 + 1),
self.in_file_index(),
)?; )?;
count += 1; count += 1;
self.trim_line_terminator(bytes, &mut line); self.trim_line_terminator(bytes, &mut line);
@@ -1178,10 +1199,11 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
absolute_byte_offset: u64, absolute_byte_offset: u64,
line_number: Option<u64>, line_number: Option<u64>,
column: Option<u64>, column: Option<u64>,
in_file_index: Option<u64>,
) -> io::Result<()> { ) -> io::Result<()> {
let mut prelude = PreludeWriter::new(self); let mut prelude = PreludeWriter::new(self);
prelude.start(line_number, column)?; prelude.start(line_number, column)?;
prelude.write_path()?; prelude.write_path(in_file_index)?;
prelude.write_line_number(line_number)?; prelude.write_line_number(line_number)?;
prelude.write_column_number(column)?; prelude.write_column_number(column)?;
prelude.write_byte_offset(absolute_byte_offset)?; prelude.write_byte_offset(absolute_byte_offset)?;
@@ -1532,6 +1554,14 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
self.sunk.context_kind().is_some() self.sunk.context_kind().is_some()
} }
/// Returns the sink's current per-file match index, or `None` when the
/// feature is disabled or the line being printed is a context line.
fn in_file_index(&self) -> Option<u64> {
    let wanted = self.config().in_file_index && !self.is_context();
    wanted.then_some(self.sink.in_file_index)
}
/// Return the underlying configuration for this printer. /// Return the underlying configuration for this printer.
fn config(&self) -> &'a Config { fn config(&self) -> &'a Config {
&self.sink.standard.config &self.sink.standard.config
@@ -1657,16 +1687,27 @@ impl<'a, M: Matcher, W: WriteColor> PreludeWriter<'a, M, W> {
/// separator. (If a path terminator is set, then that is used instead of /// separator. (If a path terminator is set, then that is used instead of
/// the field separator.) /// the field separator.)
#[inline(always)] #[inline(always)]
fn write_path(&mut self) -> io::Result<()> { fn write_path(&mut self, in_file_index: Option<u64>) -> io::Result<()> {
// The prelude doesn't handle headings, only what comes before a match // The prelude doesn't handle headings, only what comes before a match
// on the same line. So if we are emitting paths in headings, we should // on the same line. So if we are emitting paths in headings, we should
// not do it here on each line. // not do it here on each line.
if self.config().heading { if self.config().heading && in_file_index.is_none() {
return Ok(());
}
let path = self.std.path();
if path.is_none() && in_file_index.is_none() {
return Ok(()); return Ok(());
} }
let Some(path) = self.std.path() else { return Ok(()) };
self.write_separator()?; self.write_separator()?;
if let Some(path) = path {
self.std.write_path(path)?; self.std.write_path(path)?;
}
if let Some(index) = in_file_index {
self.std.write_spec(self.config().colors.path(), b"[")?;
let n = DecimalFormatter::new(index);
self.std.write_spec(self.config().colors.path(), n.as_bytes())?;
self.std.write_spec(self.config().colors.path(), b"]")?;
}
self.next_separator = if self.config().path_terminator.is_some() { self.next_separator = if self.config().path_terminator.is_some() {
PreludeSeparator::PathTerminator PreludeSeparator::PathTerminator

View File

@@ -212,6 +212,18 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
consumed consumed
} }
/// Accounts for `consumed` bytes being dropped from the front of `buf`.
///
/// Line counting is run over the dropped prefix, the absolute byte offset
/// is moved forward, and the buffer-relative markers (`last_line_visited`
/// and the search position) are shifted back by `consumed`, saturating at
/// zero. A `consumed` of zero is a no-op.
pub(crate) fn advance_buffer(&mut self, buf: &[u8], consumed: usize) {
    if consumed > 0 {
        // NOTE(review): `count_lines` is kept ahead of the
        // `last_line_counted` reset on the assumption that it reads that
        // marker; confirm against its definition.
        self.count_lines(buf, consumed);
        self.last_line_counted = 0;
        self.absolute_byte_offset += consumed as u64;

        let visited = self.last_line_visited.saturating_sub(consumed);
        self.last_line_visited = visited;

        let rebased_pos = self.pos().saturating_sub(consumed);
        self.set_pos(rebased_pos);
    }
}
pub(crate) fn detect_binary( pub(crate) fn detect_binary(
&mut self, &mut self,
buf: &[u8], buf: &[u8],

View File

@@ -1,7 +1,9 @@
use grep_matcher::Matcher; use grep_matcher::Matcher;
use std::collections::VecDeque;
use crate::{ use crate::{
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader}, line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader, alloc_error},
lines::{self, LineStep}, lines::{self, LineStep},
searcher::{Config, Range, Searcher, core::Core}, searcher::{Config, Range, Searcher, core::Core},
sink::{Sink, SinkError}, sink::{Sink, SinkError},
@@ -138,6 +140,348 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> {
} }
} }
/// A multi-line search strategy that only ever matches against a bounded
/// window of consecutive lines.
///
/// Lines are copied into an internal buffer and processed one at a time; for
/// each line, the matcher is run on that line plus at most
/// `window_lines - 1` following lines.
#[derive(Debug)]
pub(crate) struct WindowedMultiLine<'s, M, S> {
    /// The searcher configuration.
    config: &'s Config,
    /// Shared search core used for matching, context and sink reporting.
    core: Core<'s, M, S>,
    /// Maximum number of lines in one match window.
    window_lines: usize,
    /// Bytes of the buffered lines; the live region is `buf[buf_start..]`.
    buf: Vec<u8>,
    /// Offset into `buf` of the first byte that has not been dropped yet.
    buf_start: usize,
    /// Byte length of every buffered line, oldest first.
    line_lens: VecDeque<usize>,
    /// Total number of bytes dropped from the front of the buffer so far.
    abs_start: u64,
    /// Index into `line_lens` of the line that is processed next.
    current_index: usize,
    /// Set once no further input will be buffered.
    eof: bool,
}
impl<'s, M: Matcher, S: Sink> WindowedMultiLine<'s, M, S> {
/// Creates a windowed multi-line searcher with empty buffers.
///
/// In debug builds this asserts that the searcher is in multi-line mode
/// for `matcher` and that `window_lines` is non-zero.
pub(crate) fn new(
    searcher: &'s Searcher,
    matcher: M,
    window_lines: usize,
    write_to: S,
) -> WindowedMultiLine<'s, M, S> {
    debug_assert!(searcher.multi_line_with_matcher(&matcher));
    debug_assert!(window_lines > 0);

    let config = &searcher.config;
    let core = Core::new(searcher, matcher, write_to, true);
    WindowedMultiLine {
        config,
        core,
        window_lines,
        buf: Vec::new(),
        buf_start: 0,
        line_lens: VecDeque::new(),
        abs_start: 0,
        current_index: 0,
        eof: false,
    }
}
/// Runs the search over a streaming reader.
///
/// Input is buffered and processed one line at a time until the input is
/// exhausted and no buffered lines remain, or until processing asks to
/// stop. The core is always finished with the final byte count.
pub(crate) fn run_reader<R: std::io::Read>(
    mut self,
    mut rdr: LineBufferReader<'s, R>,
) -> Result<(), S::Error> {
    if self.core.begin()? {
        let mut already_binary = rdr.binary_byte_offset().is_some();
        loop {
            let more_input =
                self.fill_reader(&mut rdr, &mut already_binary)?;
            // Stop only when nothing more can arrive and nothing is queued.
            if !more_input && self.line_lens.is_empty() {
                break;
            }
            if !self.process_current_line()? {
                break;
            }
        }
    }
    let byte_count = self.byte_count();
    let binary_byte_offset = self.core.binary_byte_offset();
    self.core.finish(byte_count, binary_byte_offset)
}
/// Runs the search over an in-memory slice.
///
/// Binary detection looks at no more than the first
/// `DEFAULT_BUFFER_CAPACITY` bytes. If `detect_binary` returns `false`,
/// every line of the slice is copied into the internal buffer and then
/// processed in order.
pub(crate) fn run_slice(mut self, slice: &'s [u8]) -> Result<(), S::Error> {
    if self.core.begin()? {
        let binary_upto = slice.len().min(DEFAULT_BUFFER_CAPACITY);
        let binary_range = Range::new(0, binary_upto);
        let stop_for_binary =
            self.core.detect_binary(slice, &binary_range)?;
        if !stop_for_binary {
            let line_term = self.config.line_term.as_byte();
            let mut stepper = LineStep::new(line_term, 0, slice.len());
            while let Some(line) = stepper.next_match(slice) {
                self.push_line(&slice[line])?;
            }
            // The whole slice is buffered, so no more input can arrive.
            self.eof = true;
            while !self.line_lens.is_empty()
                && self.process_current_line()?
            {}
        }
    }
    let byte_count = self.byte_count();
    let binary_byte_offset = self.core.binary_byte_offset();
    self.core.finish(byte_count, binary_byte_offset)
}
fn fill_reader<R: std::io::Read>(
&mut self,
rdr: &mut LineBufferReader<'s, R>,
already_binary: &mut bool,
) -> Result<bool, S::Error> {
while !self.eof
&& self.line_lens.len() < self.current_index + self.window_lines
{
let didread = match rdr.fill() {
Err(err) => return Err(S::Error::error_io(err)),
Ok(didread) => didread,
};
if !*already_binary {
if let Some(offset) = rdr.binary_byte_offset() {
*already_binary = true;
if !self.core.binary_data(offset)? {
self.eof = true;
return Ok(false);
}
}
}
if !didread {
self.eof = true;
break;
}
let buf = rdr.buffer();
let mut stepper = LineStep::new(
self.config.line_term.as_byte(),
0,
buf.len(),
);
while let Some(line) = stepper.next_match(buf) {
let bytes = &buf[line];
self.push_line(bytes)?;
}
rdr.consume(buf.len());
}
Ok(!self.eof)
}
fn push_line(&mut self, line: &[u8]) -> Result<(), S::Error> {
self.buf.extend_from_slice(line);
self.line_lens.push_back(line.len());
if let Some(limit) = self.config.heap_limit {
let used = self.buf.len() - self.buf_start;
if used > limit {
return Err(S::Error::error_io(alloc_error(limit)));
}
}
Ok(())
}
/// Searches the window that starts at the current line and reports the
/// result to the sink.
///
/// Returns `Ok(false)` when there is no buffered line left to process, when
/// any sink/context call asks to stop, or once the last buffered line has
/// been handled after end of input. Returns `Ok(true)` when the caller
/// should keep going.
fn process_current_line(&mut self) -> Result<bool, S::Error> {
    if self.current_index >= self.line_lens.len() {
        return Ok(false);
    }
    // The window covers the current line plus as many following lines as
    // are buffered, capped at `window_lines` in total.
    let window_end =
        std::cmp::min(self.line_lens.len(), self.current_index + self.window_lines);
    // Offsets are relative to the live buffer, i.e. `buf[buf_start..]`.
    let window_start_off = self.line_offset(self.current_index);
    let window_end_off = self.line_offset(window_end);
    let line0_len = self.line_lens[self.current_index];
    {
        let buffer = &self.buf[self.buf_start..];
        let window_bytes =
            &self.buf[self.buf_start + window_start_off
                ..self.buf_start + window_end_off];
        if self.config.invert_match {
            if !sink_inverted_line(
                &mut self.core,
                self.config,
                buffer,
                window_bytes,
                window_start_off,
                line0_len,
            )? {
                return Ok(false);
            }
        } else if !sink_matched_line(
            &mut self.core,
            self.config,
            buffer,
            window_bytes,
            window_start_off,
            line0_len,
        )? {
            return Ok(false);
        }
        // Run context handling up to the end of the current line before it
        // may be dropped from the buffer below.
        let drop_upto = window_start_off + line0_len;
        if self.config.passthru {
            if !self.core.other_context_by_line(buffer, drop_upto)? {
                return Ok(false);
            }
        } else if !self.core.after_context_by_line(buffer, drop_upto)? {
            return Ok(false);
        }
    }
    self.current_index += 1;
    // Keep at most `before_context` already-processed lines in front of the
    // current line; anything older is dropped from the buffer.
    if self.current_index > self.config.before_context {
        let drop_len = self.line_lens.pop_front().unwrap();
        self.shift_buffer(drop_len);
        self.current_index -= 1;
    }
    // Nothing left to process and no more input: run context handling over
    // the rest of the buffer and tell the caller to stop.
    if self.eof && self.current_index >= self.line_lens.len() {
        let buffer = &self.buf[self.buf_start..];
        if self.config.passthru {
            if !self.core.other_context_by_line(buffer, buffer.len())? {
                return Ok(false);
            }
        } else if !self.core.after_context_by_line(buffer, buffer.len())? {
            return Ok(false);
        }
        return Ok(false);
    }
    Ok(true)
}
/// Returns the byte offset, relative to the live buffer, at which the
/// buffered line with index `idx` starts (the summed length of all lines
/// before it).
fn line_offset(&self, idx: usize) -> usize {
    let preceding = self.line_lens.iter().take(idx);
    preceding.copied().sum()
}
/// Drops `consumed` bytes from the front of the live buffer.
///
/// The core is told about the dropped bytes first. The dead prefix is only
/// physically removed from `buf` once it makes up more than half of it, so
/// that bytes are not moved on every call.
fn shift_buffer(&mut self, consumed: usize) {
    self.core.advance_buffer(&self.buf[self.buf_start..], consumed);
    self.abs_start += consumed as u64;
    self.buf_start += consumed;

    let dead = self.buf_start;
    // `dead > len / 2` already implies `dead > 0`.
    if dead > self.buf.len() / 2 {
        self.buf.drain(..dead);
        self.buf_start = 0;
    }
}
/// Returns the number of bytes reported as searched.
///
/// This is the binary byte offset when one is set and lies before the
/// core's current position; otherwise it is the number of dropped bytes
/// plus the bytes still held in the live buffer.
fn byte_count(&mut self) -> u64 {
    let pos = self.core.pos() as u64;
    self.core
        .binary_byte_offset()
        .filter(|&offset| offset < pos)
        .unwrap_or_else(|| {
            self.abs_start + (self.buf.len() - self.buf_start) as u64
        })
}
}
/// Reports every match in `window_bytes` that starts inside the first line
/// of the window.
///
/// Each match is widened to whole lines and translated into `buffer`
/// coordinates via `window_start_off`. Line ranges that touch or overlap
/// are merged before being reported. Returns `Ok(false)` as soon as the
/// core asks to stop.
fn sink_matched_line<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    window_bytes: &[u8],
    window_start_off: usize,
    line0_len: usize,
) -> Result<bool, S::Error> {
    // Emits context for, and then reports, one merged line range.
    fn emit<M: Matcher, S: Sink>(
        core: &mut Core<'_, M, S>,
        config: &Config,
        buffer: &[u8],
        range: &Range,
    ) -> Result<bool, S::Error> {
        Ok(sink_context(core, config, buffer, range)?
            && core.matched(buffer, range)?)
    }

    let line_term = config.line_term.as_byte();
    let mut pending: Option<Range> = None;
    let mut search_from = 0;
    while let Some(mat) = find_in_window(core, window_bytes, search_from)? {
        // Matches starting after the first line belong to later windows.
        if mat.start() >= line0_len {
            break;
        }
        let line = lines::locate(window_bytes, line_term, mat)
            .offset(window_start_off);
        pending = match pending.take() {
            None => Some(line),
            Some(prev) if prev.end() >= line.start() => {
                Some(prev.with_end(line.end()))
            }
            Some(prev) => {
                if !emit(core, config, buffer, &prev)? {
                    return Ok(false);
                }
                Some(line)
            }
        };
        search_from = mat.end();
        // Step past an empty match so the search keeps moving forward.
        if mat.is_empty() && search_from < window_bytes.len() {
            search_from += 1;
        }
    }
    match pending {
        Some(last) => emit(core, config, buffer, &last),
        None => Ok(true),
    }
}
/// Reports the first line of the window for an inverted search.
///
/// The line is reported (with its context) only when the first match found
/// in `window_bytes`, if any, does not start inside that first line.
/// Returns `Ok(false)` when the core asks to stop and `Ok(true)` otherwise.
///
/// Only the first match can matter: it either starts inside the first line
/// (the line is a match, nothing to report) or at/after its end (no match
/// starts in the line at all). The previous loop therefore never ran a
/// second iteration and its position bookkeeping was unreachable.
///
/// NOTE(review): a match that starts on an earlier line and extends into
/// this one is not visible here, so such a line is still reported as
/// non-matching. Confirm whether that is intended.
fn sink_inverted_line<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    window_bytes: &[u8],
    window_start_off: usize,
    line0_len: usize,
) -> Result<bool, S::Error> {
    if let Some(mat) = find_in_window(core, window_bytes, 0)? {
        if mat.start() < line0_len {
            return Ok(true);
        }
    }
    let line = Range::new(window_start_off, window_start_off + line0_len);
    if !sink_context(core, config, buffer, &line)? {
        return Ok(false);
    }
    if !core.matched(buffer, &line)? {
        return Ok(false);
    }
    Ok(true)
}
/// Searches `window_bytes[pos..]` and returns the first match, if any, with
/// its offsets expressed relative to the start of `window_bytes`.
fn find_in_window<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    window_bytes: &[u8],
    pos: usize,
) -> Result<Option<Range>, S::Error> {
    let found = core.find(&window_bytes[pos..])?;
    Ok(found.map(|mat| mat.offset(pos)))
}
/// Emits the context lines that precede `range` in `buffer`.
///
/// In passthru mode everything up to the start of `range` is handed to
/// `other_context_by_line`. Otherwise the after-context handler runs first
/// and the before-context handler second. Returns `Ok(false)` as soon as
/// one of them asks to stop.
fn sink_context<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    let upto = range.start();
    if config.passthru {
        return core.other_context_by_line(buffer, upto);
    }
    // `&&` short-circuits, so before-context is skipped after a stop.
    Ok(core.after_context_by_line(buffer, upto)?
        && core.before_context_by_line(buffer, upto)?)
}
#[derive(Debug)] #[derive(Debug)]
pub(crate) struct MultiLine<'s, M, S> { pub(crate) struct MultiLine<'s, M, S> {
config: &'s Config, config: &'s Config,
@@ -518,6 +862,37 @@ byte count:366
.test(); .test();
} }
#[test]
fn multi_line_window_limits_match() {
    let haystack = "a\nb\nc\nd\n";
    let matcher = RegexMatcher::new("a\nb\nc");

    // Searches the haystack with the given window and returns sink output.
    let search_with_window = |window: usize| -> String {
        let mut builder = SearcherBuilder::new();
        builder
            .multi_line(true)
            .multiline_window(Some(window))
            .line_number(false);
        let mut sink = KitchenSink::new();
        let mut searcher = builder.build();
        searcher
            .search_slice(&matcher, haystack.as_bytes(), &mut sink)
            .unwrap();
        String::from_utf8(sink.as_bytes().to_vec()).unwrap()
    };

    // The pattern spans three lines, so a two-line window finds nothing.
    let exp = format!("\nbyte count:{}\n", haystack.len());
    assert_eq!(exp, search_with_window(2));

    // A three-line window is just large enough for the match.
    let exp = format!(
        "0:a\n2:b\n4:c\n\nbyte count:{}\n",
        haystack.len()
    );
    assert_eq!(exp, search_with_window(3));
}
#[test] #[test]
fn multi_line_overlap2() { fn multi_line_overlap2() {
let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx"; let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx";

View File

@@ -16,7 +16,7 @@ use crate::{
self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer, self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
LineBufferBuilder, LineBufferReader, alloc_error, LineBufferBuilder, LineBufferReader, alloc_error,
}, },
searcher::glue::{MultiLine, ReadByLine, SliceByLine}, searcher::glue::{MultiLine, ReadByLine, SliceByLine, WindowedMultiLine},
sink::{Sink, SinkError}, sink::{Sink, SinkError},
}; };
@@ -172,6 +172,8 @@ pub struct Config {
binary: BinaryDetection, binary: BinaryDetection,
/// Whether to enable matching across multiple lines. /// Whether to enable matching across multiple lines.
multi_line: bool, multi_line: bool,
/// The maximum number of lines a multi-line match may span.
multiline_window: Option<usize>,
/// An encoding that, when present, causes the searcher to transcode all /// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8. /// input from the encoding to UTF-8.
encoding: Option<Encoding>, encoding: Option<Encoding>,
@@ -197,6 +199,7 @@ impl Default for Config {
mmap: MmapChoice::default(), mmap: MmapChoice::default(),
binary: BinaryDetection::default(), binary: BinaryDetection::default(),
multi_line: false, multi_line: false,
multiline_window: None,
encoding: None, encoding: None,
bom_sniffing: true, bom_sniffing: true,
stop_on_nonmatch: false, stop_on_nonmatch: false,
@@ -390,6 +393,15 @@ impl SearcherBuilder {
self self
} }
/// Restrict multi-line matches to a window of at most `line_count` lines.
///
/// `None` removes the restriction.
///
/// NOTE(review): `Some(0)` is stored as-is here; the windowed searcher only
/// checks for a non-zero window with a debug assertion, so callers should
/// pass a value of at least 1.
pub fn multiline_window(
    &mut self,
    line_count: Option<usize>,
) -> &mut SearcherBuilder {
    let config = &mut self.config;
    config.multiline_window = line_count;
    self
}
/// Whether to include a fixed number of lines after every match. /// Whether to include a fixed number of lines after every match.
/// ///
/// When this is set to a non-zero number, then the searcher will report /// When this is set to a non-zero number, then the searcher will report
@@ -694,6 +706,13 @@ impl Searcher {
// enabled. This pre-allocates a buffer roughly the size of the file, // enabled. This pre-allocates a buffer roughly the size of the file,
// which isn't possible when searching an arbitrary std::io::Read. // which isn't possible when searching an arbitrary std::io::Read.
if self.multi_line_with_matcher(&matcher) { if self.multi_line_with_matcher(&matcher) {
if self.config.multiline_window.is_some() {
log::trace!(
"{:?}: searching via windowed multiline strategy",
path
);
return self.search_reader(matcher, file, write_to);
}
log::trace!( log::trace!(
"{:?}: reading entire file on to heap for mulitline", "{:?}: reading entire file on to heap for mulitline",
path path
@@ -744,6 +763,18 @@ impl Searcher {
.map_err(S::Error::error_io)?; .map_err(S::Error::error_io)?;
if self.multi_line_with_matcher(&matcher) { if self.multi_line_with_matcher(&matcher) {
if let Some(window_lines) = self.config.multiline_window {
let mut line_buffer = self.line_buffer.borrow_mut();
let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
log::trace!("generic reader: searching via windowed multiline");
return WindowedMultiLine::new(
self,
matcher,
window_lines,
write_to,
)
.run_reader(rdr);
}
log::trace!( log::trace!(
"generic reader: reading everything to heap for multiline" "generic reader: reading everything to heap for multiline"
); );
@@ -786,6 +817,16 @@ impl Searcher {
return self.search_reader(matcher, slice, write_to); return self.search_reader(matcher, slice, write_to);
} }
if self.multi_line_with_matcher(&matcher) { if self.multi_line_with_matcher(&matcher) {
if let Some(window_lines) = self.config.multiline_window {
log::trace!("slice reader: searching via windowed multiline");
return WindowedMultiLine::new(
self,
matcher,
window_lines,
write_to,
)
.run_slice(slice);
}
log::trace!("slice reader: searching via multiline strategy"); log::trace!("slice reader: searching via multiline strategy");
MultiLine::new(self, matcher, slice, write_to).run() MultiLine::new(self, matcher, slice, write_to).run()
} else { } else {
@@ -865,6 +906,12 @@ impl Searcher {
self.config.multi_line self.config.multi_line
} }
/// Returns the maximum number of lines a multi-line match may span, as
/// configured on this searcher, or `None` when no window has been set.
#[inline]
pub fn multiline_window(&self) -> Option<usize> {
    self.config.multiline_window
}
/// Returns true if and only if this searcher is configured to stop when it /// Returns true if and only if this searcher is configured to stop when it
/// finds a non-matching line after a matching one. /// finds a non-matching line after a matching one.
#[inline] #[inline]