rgs: added multiline window limit and in-file result indexing (work in progress)
This commit is contained in:
@@ -30,7 +30,7 @@ rust-version = "1.85"
|
||||
[[bin]]
|
||||
bench = false
|
||||
path = "crates/core/main.rs"
|
||||
name = "rg"
|
||||
name = "rgs"
|
||||
|
||||
[[test]]
|
||||
name = "integration"
|
||||
|
||||
@@ -96,6 +96,8 @@ _rg() {
|
||||
+ '(file-name)' # File-name options
|
||||
{-H,--with-filename}'[show file name for matches]'
|
||||
{-I,--no-filename}"[don't show file name for matches]"
|
||||
'--in-file-index[show per-file match index in output]'
|
||||
'--no-in-file-index[hide per-file match index in output]'
|
||||
|
||||
+ '(file-system)' # File system options
|
||||
"--one-file-system[don't descend into directories on other file systems]"
|
||||
@@ -210,6 +212,7 @@ _rg() {
|
||||
|
||||
+ '(multiline)' # Multiline options
|
||||
{-U,--multiline}'[permit matching across multiple lines]'
|
||||
'--multiline-window=[limit multiline matches to NUM lines (with -U)]:number of lines'
|
||||
$no'(multiline-dotall)--no-multiline[restrict matches to at most one line each]'
|
||||
|
||||
+ '(multiline-dotall)' # Multiline DOTALL options
|
||||
|
||||
@@ -97,6 +97,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[
|
||||
&MaxFilesize,
|
||||
&Mmap,
|
||||
&Multiline,
|
||||
&MultilineWindow,
|
||||
&MultilineDotall,
|
||||
&NoConfig,
|
||||
&NoIgnore,
|
||||
@@ -142,6 +143,7 @@ pub(super) const FLAGS: &[&dyn Flag] = &[
|
||||
&Unrestricted,
|
||||
&Version,
|
||||
&Vimgrep,
|
||||
&InFileIndex,
|
||||
&WithFilename,
|
||||
&WithFilenameNo,
|
||||
&WordRegexp,
|
||||
@@ -4209,6 +4211,59 @@ fn test_multiline() {
|
||||
assert_eq!(false, args.multiline);
|
||||
}
|
||||
|
||||
/// --multiline-window
|
||||
#[derive(Debug)]
|
||||
struct MultilineWindow;
|
||||
|
||||
impl Flag for MultilineWindow {
|
||||
fn is_switch(&self) -> bool {
|
||||
false
|
||||
}
|
||||
fn name_long(&self) -> &'static str {
|
||||
"multiline-window"
|
||||
}
|
||||
fn doc_variable(&self) -> Option<&'static str> {
|
||||
Some("NUM")
|
||||
}
|
||||
fn doc_category(&self) -> Category {
|
||||
Category::Search
|
||||
}
|
||||
fn doc_short(&self) -> &'static str {
|
||||
r"Limit multiline matches to a fixed number of lines."
|
||||
}
|
||||
fn doc_long(&self) -> &'static str {
|
||||
r#"
|
||||
Limit the maximum number of lines that a multiline match may span to
|
||||
\fINUM\fP (use \fB--multiline-window=\fP\fINUM\fP).
|
||||
.sp
|
||||
This flag requires \flag{multiline}. Matches are found as if the file being
|
||||
searched were limited to \fINUM\fP lines at a time, which can prevent
|
||||
unintended long matches while still enabling multi-line searching.
|
||||
.sp
|
||||
The value of \fINUM\fP must be at least 1.
|
||||
"#
|
||||
}
|
||||
|
||||
fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> {
|
||||
let lines = convert::usize(&v.unwrap_value())?;
|
||||
if lines == 0 {
|
||||
anyhow::bail!("--multiline-window must be at least 1");
|
||||
}
|
||||
args.multiline_window = Some(lines);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
#[test]
fn test_multiline_window() {
    // The window is unset unless the flag is given.
    let args = parse_low_raw(None::<&str>).unwrap();
    assert_eq!(None, args.multiline_window);

    let args = parse_low_raw(["--multiline-window=2"]).unwrap();
    assert_eq!(Some(2), args.multiline_window);

    // A window of zero lines is rejected by the flag's `update`.
    assert!(parse_low_raw(["--multiline-window=0"]).is_err());
}
|
||||
|
||||
/// --multiline-dotall
|
||||
#[derive(Debug)]
|
||||
struct MultilineDotall;
|
||||
@@ -7401,6 +7456,53 @@ fn test_vimgrep() {
|
||||
assert_eq!(true, args.vimgrep);
|
||||
}
|
||||
|
||||
/// --in-file-index
|
||||
#[derive(Debug)]
|
||||
struct InFileIndex;
|
||||
|
||||
impl Flag for InFileIndex {
|
||||
fn is_switch(&self) -> bool {
|
||||
true
|
||||
}
|
||||
fn name_long(&self) -> &'static str {
|
||||
"in-file-index"
|
||||
}
|
||||
fn name_negated(&self) -> Option<&'static str> {
|
||||
Some("no-in-file-index")
|
||||
}
|
||||
fn doc_category(&self) -> Category {
|
||||
Category::Output
|
||||
}
|
||||
fn doc_short(&self) -> &'static str {
|
||||
r"Prefix matches with an index per file."
|
||||
}
|
||||
fn doc_long(&self) -> &'static str {
|
||||
r"
|
||||
When enabled, ripgrep prefixes each matching line with an index that is
|
||||
incremented per file. The format is \fIFILE\fP[\fIN\fP]:\fILINE\fP:, which can
|
||||
disambiguate multi-line matches that print the same line multiple times.
|
||||
"
|
||||
}
|
||||
|
||||
fn update(&self, v: FlagValue, args: &mut LowArgs) -> anyhow::Result<()> {
|
||||
args.in_file_index = v.unwrap_switch();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
#[test]
fn test_in_file_index() {
    // Off when no flag is given.
    let defaults = parse_low_raw(None::<&str>).unwrap();
    assert_eq!(false, defaults.in_file_index);

    // The positive switch turns it on.
    let enabled = parse_low_raw(["--in-file-index"]).unwrap();
    assert_eq!(true, enabled.in_file_index);

    // A later negation overrides an earlier positive switch.
    let negated =
        parse_low_raw(["--in-file-index", "--no-in-file-index"]).unwrap();
    assert_eq!(false, negated.in_file_index);
}
|
||||
|
||||
/// --with-filename
|
||||
#[derive(Debug)]
|
||||
struct WithFilename;
|
||||
|
||||
@@ -61,6 +61,7 @@ pub(crate) struct HiArgs {
|
||||
ignore_file_case_insensitive: bool,
|
||||
ignore_file: Vec<PathBuf>,
|
||||
include_zero: bool,
|
||||
in_file_index: bool,
|
||||
invert_match: bool,
|
||||
is_terminal_stdout: bool,
|
||||
line_number: bool,
|
||||
@@ -73,6 +74,7 @@ pub(crate) struct HiArgs {
|
||||
mode: Mode,
|
||||
multiline: bool,
|
||||
multiline_dotall: bool,
|
||||
multiline_window: Option<usize>,
|
||||
no_ignore_dot: bool,
|
||||
no_ignore_exclude: bool,
|
||||
no_ignore_files: bool,
|
||||
@@ -140,6 +142,9 @@ impl HiArgs {
|
||||
}
|
||||
|
||||
let mut state = State::new()?;
|
||||
if low.multiline_window.is_some() && !low.multiline {
|
||||
anyhow::bail!("--multiline-window requires --multiline");
|
||||
}
|
||||
let patterns = Patterns::from_low_args(&mut state, &mut low)?;
|
||||
let paths = Paths::from_low_args(&mut state, &patterns, &mut low)?;
|
||||
|
||||
@@ -278,6 +283,7 @@ impl HiArgs {
|
||||
ignore_file: low.ignore_file,
|
||||
ignore_file_case_insensitive: low.ignore_file_case_insensitive,
|
||||
include_zero: low.include_zero,
|
||||
in_file_index: low.in_file_index,
|
||||
invert_match: low.invert_match,
|
||||
is_terminal_stdout: state.is_terminal_stdout,
|
||||
line_number,
|
||||
@@ -289,6 +295,7 @@ impl HiArgs {
|
||||
mmap_choice,
|
||||
multiline: low.multiline,
|
||||
multiline_dotall: low.multiline_dotall,
|
||||
multiline_window: low.multiline_window,
|
||||
no_ignore_dot: low.no_ignore_dot,
|
||||
no_ignore_exclude: low.no_ignore_exclude,
|
||||
no_ignore_files: low.no_ignore_files,
|
||||
@@ -616,6 +623,7 @@ impl HiArgs {
|
||||
.column(self.column)
|
||||
.heading(self.heading)
|
||||
.hyperlink(self.hyperlink_config.clone())
|
||||
.in_file_index(self.in_file_index)
|
||||
.max_columns_preview(self.max_columns_preview)
|
||||
.max_columns(self.max_columns)
|
||||
.only_matching(self.only_matching)
|
||||
@@ -723,6 +731,7 @@ impl HiArgs {
|
||||
.invert_match(self.invert_match)
|
||||
.line_number(self.line_number)
|
||||
.multi_line(self.multiline)
|
||||
.multiline_window(self.multiline_window)
|
||||
.memory_map(self.mmap_choice.clone())
|
||||
.stop_on_nonmatch(self.stop_on_nonmatch);
|
||||
match self.context {
|
||||
|
||||
@@ -65,6 +65,7 @@ pub(crate) struct LowArgs {
|
||||
pub(crate) ignore_file: Vec<PathBuf>,
|
||||
pub(crate) ignore_file_case_insensitive: bool,
|
||||
pub(crate) include_zero: bool,
|
||||
pub(crate) in_file_index: bool,
|
||||
pub(crate) invert_match: bool,
|
||||
pub(crate) line_number: Option<bool>,
|
||||
pub(crate) logging: Option<LoggingMode>,
|
||||
@@ -76,6 +77,7 @@ pub(crate) struct LowArgs {
|
||||
pub(crate) mmap: MmapMode,
|
||||
pub(crate) multiline: bool,
|
||||
pub(crate) multiline_dotall: bool,
|
||||
pub(crate) multiline_window: Option<usize>,
|
||||
pub(crate) no_config: bool,
|
||||
pub(crate) no_ignore_dot: bool,
|
||||
pub(crate) no_ignore_exclude: bool,
|
||||
|
||||
@@ -39,6 +39,7 @@ struct Config {
|
||||
stats: bool,
|
||||
heading: bool,
|
||||
path: bool,
|
||||
in_file_index: bool,
|
||||
only_matching: bool,
|
||||
per_match: bool,
|
||||
per_match_one_line: bool,
|
||||
@@ -64,6 +65,7 @@ impl Default for Config {
|
||||
stats: false,
|
||||
heading: false,
|
||||
path: true,
|
||||
in_file_index: false,
|
||||
only_matching: false,
|
||||
per_match: false,
|
||||
per_match_one_line: false,
|
||||
@@ -231,6 +233,12 @@ impl StandardBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// When enabled, prefix matching lines with a per-file match index.
|
||||
pub fn in_file_index(&mut self, yes: bool) -> &mut StandardBuilder {
|
||||
self.config.in_file_index = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Only print the specific matches instead of the entire line containing
|
||||
/// each match. Each match is printed on its own line. When multi line
|
||||
/// search is enabled, then matches spanning multiple lines are printed
|
||||
@@ -528,6 +536,7 @@ impl<W: WriteColor> Standard<W> {
|
||||
path: None,
|
||||
start_time: Instant::now(),
|
||||
match_count: 0,
|
||||
in_file_index: 0,
|
||||
binary_byte_offset: None,
|
||||
stats,
|
||||
needs_match_granularity,
|
||||
@@ -564,6 +573,7 @@ impl<W: WriteColor> Standard<W> {
|
||||
path: Some(ppath),
|
||||
start_time: Instant::now(),
|
||||
match_count: 0,
|
||||
in_file_index: 0,
|
||||
binary_byte_offset: None,
|
||||
stats,
|
||||
needs_match_granularity,
|
||||
@@ -644,6 +654,7 @@ pub struct StandardSink<'p, 's, M: Matcher, W> {
|
||||
path: Option<PrinterPath<'p>>,
|
||||
start_time: Instant,
|
||||
match_count: u64,
|
||||
in_file_index: u64,
|
||||
binary_byte_offset: Option<u64>,
|
||||
stats: Option<Stats>,
|
||||
needs_match_granularity: bool,
|
||||
@@ -769,6 +780,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> {
|
||||
mat: &SinkMatch<'_>,
|
||||
) -> Result<bool, io::Error> {
|
||||
self.match_count += 1;
|
||||
self.in_file_index += 1;
|
||||
|
||||
self.record_matches(
|
||||
searcher,
|
||||
@@ -842,6 +854,7 @@ impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> {
|
||||
self.standard.wtr.borrow_mut().reset_count();
|
||||
self.start_time = Instant::now();
|
||||
self.match_count = 0;
|
||||
self.in_file_index = 0;
|
||||
self.binary_byte_offset = None;
|
||||
Ok(true)
|
||||
}
|
||||
@@ -956,6 +969,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset(),
|
||||
self.sunk.line_number(),
|
||||
None,
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
self.write_line(self.sunk.bytes())
|
||||
}
|
||||
@@ -981,6 +995,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
absolute_byte_offset,
|
||||
self.sunk.line_number().map(|n| n + i as u64),
|
||||
None,
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
absolute_byte_offset += line.len() as u64;
|
||||
|
||||
@@ -1001,6 +1016,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset() + m.start() as u64,
|
||||
self.sunk.line_number(),
|
||||
Some(m.start() as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
|
||||
let buf = &self.sunk.bytes()[m];
|
||||
@@ -1012,6 +1028,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset() + m.start() as u64,
|
||||
self.sunk.line_number(),
|
||||
Some(m.start() as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
self.write_colored_line(&[m], self.sunk.bytes())?;
|
||||
}
|
||||
@@ -1020,6 +1037,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset(),
|
||||
self.sunk.line_number(),
|
||||
Some(self.sunk.matches()[0].start() as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
self.write_colored_line(self.sunk.matches(), self.sunk.bytes())?;
|
||||
}
|
||||
@@ -1048,6 +1066,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset() + line.start() as u64,
|
||||
self.sunk.line_number().map(|n| n + count),
|
||||
Some(matches[0].start() as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
count += 1;
|
||||
self.trim_ascii_prefix(bytes, &mut line);
|
||||
@@ -1093,6 +1112,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset() + m.start() as u64,
|
||||
self.sunk.line_number().map(|n| n + count),
|
||||
Some(m.start() as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
|
||||
let this_line = line.with_end(upto);
|
||||
@@ -1131,6 +1151,7 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.absolute_byte_offset() + line.start() as u64,
|
||||
self.sunk.line_number().map(|n| n + count),
|
||||
Some(m.start().saturating_sub(line.start()) as u64 + 1),
|
||||
self.in_file_index(),
|
||||
)?;
|
||||
count += 1;
|
||||
self.trim_line_terminator(bytes, &mut line);
|
||||
@@ -1178,10 +1199,11 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
absolute_byte_offset: u64,
|
||||
line_number: Option<u64>,
|
||||
column: Option<u64>,
|
||||
in_file_index: Option<u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut prelude = PreludeWriter::new(self);
|
||||
prelude.start(line_number, column)?;
|
||||
prelude.write_path()?;
|
||||
prelude.write_path(in_file_index)?;
|
||||
prelude.write_line_number(line_number)?;
|
||||
prelude.write_column_number(column)?;
|
||||
prelude.write_byte_offset(absolute_byte_offset)?;
|
||||
@@ -1532,6 +1554,14 @@ impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> {
|
||||
self.sunk.context_kind().is_some()
|
||||
}
|
||||
|
||||
fn in_file_index(&self) -> Option<u64> {
|
||||
if self.is_context() || !self.config().in_file_index {
|
||||
None
|
||||
} else {
|
||||
Some(self.sink.in_file_index)
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the underlying configuration for this printer.
|
||||
fn config(&self) -> &'a Config {
|
||||
&self.sink.standard.config
|
||||
@@ -1657,16 +1687,27 @@ impl<'a, M: Matcher, W: WriteColor> PreludeWriter<'a, M, W> {
|
||||
/// separator. (If a path terminator is set, then that is used instead of
|
||||
/// the field separator.)
|
||||
#[inline(always)]
|
||||
fn write_path(&mut self) -> io::Result<()> {
|
||||
fn write_path(&mut self, in_file_index: Option<u64>) -> io::Result<()> {
|
||||
// The prelude doesn't handle headings, only what comes before a match
|
||||
// on the same line. So if we are emitting paths in headings, we should
|
||||
// not do it here on each line.
|
||||
if self.config().heading {
|
||||
if self.config().heading && in_file_index.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
let path = self.std.path();
|
||||
if path.is_none() && in_file_index.is_none() {
|
||||
return Ok(());
|
||||
}
|
||||
let Some(path) = self.std.path() else { return Ok(()) };
|
||||
self.write_separator()?;
|
||||
if let Some(path) = path {
|
||||
self.std.write_path(path)?;
|
||||
}
|
||||
if let Some(index) = in_file_index {
|
||||
self.std.write_spec(self.config().colors.path(), b"[")?;
|
||||
let n = DecimalFormatter::new(index);
|
||||
self.std.write_spec(self.config().colors.path(), n.as_bytes())?;
|
||||
self.std.write_spec(self.config().colors.path(), b"]")?;
|
||||
}
|
||||
|
||||
self.next_separator = if self.config().path_terminator.is_some() {
|
||||
PreludeSeparator::PathTerminator
|
||||
|
||||
@@ -212,6 +212,18 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
|
||||
consumed
|
||||
}
|
||||
|
||||
pub(crate) fn advance_buffer(&mut self, buf: &[u8], consumed: usize) {
|
||||
if consumed == 0 {
|
||||
return;
|
||||
}
|
||||
self.count_lines(buf, consumed);
|
||||
self.absolute_byte_offset += consumed as u64;
|
||||
self.last_line_counted = 0;
|
||||
self.last_line_visited =
|
||||
self.last_line_visited.saturating_sub(consumed);
|
||||
self.set_pos(self.pos().saturating_sub(consumed));
|
||||
}
|
||||
|
||||
pub(crate) fn detect_binary(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use grep_matcher::Matcher;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use crate::{
|
||||
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader},
|
||||
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader, alloc_error},
|
||||
lines::{self, LineStep},
|
||||
searcher::{Config, Range, Searcher, core::Core},
|
||||
sink::{Sink, SinkError},
|
||||
@@ -138,6 +140,348 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct WindowedMultiLine<'s, M, S> {
|
||||
config: &'s Config,
|
||||
core: Core<'s, M, S>,
|
||||
window_lines: usize,
|
||||
buf: Vec<u8>,
|
||||
buf_start: usize,
|
||||
line_lens: VecDeque<usize>,
|
||||
abs_start: u64,
|
||||
current_index: usize,
|
||||
eof: bool,
|
||||
}
|
||||
|
||||
impl<'s, M: Matcher, S: Sink> WindowedMultiLine<'s, M, S> {
|
||||
pub(crate) fn new(
|
||||
searcher: &'s Searcher,
|
||||
matcher: M,
|
||||
window_lines: usize,
|
||||
write_to: S,
|
||||
) -> WindowedMultiLine<'s, M, S> {
|
||||
debug_assert!(searcher.multi_line_with_matcher(&matcher));
|
||||
debug_assert!(window_lines > 0);
|
||||
|
||||
WindowedMultiLine {
|
||||
config: &searcher.config,
|
||||
core: Core::new(searcher, matcher, write_to, true),
|
||||
window_lines,
|
||||
buf: Vec::new(),
|
||||
buf_start: 0,
|
||||
line_lens: VecDeque::new(),
|
||||
abs_start: 0,
|
||||
current_index: 0,
|
||||
eof: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn run_reader<R: std::io::Read>(
|
||||
mut self,
|
||||
mut rdr: LineBufferReader<'s, R>,
|
||||
) -> Result<(), S::Error> {
|
||||
if self.core.begin()? {
|
||||
let mut already_binary = rdr.binary_byte_offset().is_some();
|
||||
while self.fill_reader(&mut rdr, &mut already_binary)?
|
||||
|| !self.line_lens.is_empty()
|
||||
{
|
||||
if !self.process_current_line()? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let byte_count = self.byte_count();
|
||||
let binary_byte_offset = self.core.binary_byte_offset();
|
||||
self.core.finish(byte_count, binary_byte_offset)
|
||||
}
|
||||
|
||||
pub(crate) fn run_slice(mut self, slice: &'s [u8]) -> Result<(), S::Error> {
|
||||
if self.core.begin()? {
|
||||
let binary_upto =
|
||||
std::cmp::min(slice.len(), DEFAULT_BUFFER_CAPACITY);
|
||||
let binary_range = Range::new(0, binary_upto);
|
||||
if !self.core.detect_binary(slice, &binary_range)? {
|
||||
let mut stepper = LineStep::new(
|
||||
self.config.line_term.as_byte(),
|
||||
0,
|
||||
slice.len(),
|
||||
);
|
||||
while let Some(line) = stepper.next_match(slice) {
|
||||
self.push_line(&slice[line])?;
|
||||
}
|
||||
self.eof = true;
|
||||
while !self.line_lens.is_empty() {
|
||||
if !self.process_current_line()? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let byte_count = self.byte_count();
|
||||
let binary_byte_offset = self.core.binary_byte_offset();
|
||||
self.core.finish(byte_count, binary_byte_offset)
|
||||
}
|
||||
|
||||
fn fill_reader<R: std::io::Read>(
|
||||
&mut self,
|
||||
rdr: &mut LineBufferReader<'s, R>,
|
||||
already_binary: &mut bool,
|
||||
) -> Result<bool, S::Error> {
|
||||
while !self.eof
|
||||
&& self.line_lens.len() < self.current_index + self.window_lines
|
||||
{
|
||||
let didread = match rdr.fill() {
|
||||
Err(err) => return Err(S::Error::error_io(err)),
|
||||
Ok(didread) => didread,
|
||||
};
|
||||
if !*already_binary {
|
||||
if let Some(offset) = rdr.binary_byte_offset() {
|
||||
*already_binary = true;
|
||||
if !self.core.binary_data(offset)? {
|
||||
self.eof = true;
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !didread {
|
||||
self.eof = true;
|
||||
break;
|
||||
}
|
||||
let buf = rdr.buffer();
|
||||
let mut stepper = LineStep::new(
|
||||
self.config.line_term.as_byte(),
|
||||
0,
|
||||
buf.len(),
|
||||
);
|
||||
while let Some(line) = stepper.next_match(buf) {
|
||||
let bytes = &buf[line];
|
||||
self.push_line(bytes)?;
|
||||
}
|
||||
rdr.consume(buf.len());
|
||||
}
|
||||
Ok(!self.eof)
|
||||
}
|
||||
|
||||
fn push_line(&mut self, line: &[u8]) -> Result<(), S::Error> {
|
||||
self.buf.extend_from_slice(line);
|
||||
self.line_lens.push_back(line.len());
|
||||
if let Some(limit) = self.config.heap_limit {
|
||||
let used = self.buf.len() - self.buf_start;
|
||||
if used > limit {
|
||||
return Err(S::Error::error_io(alloc_error(limit)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_current_line(&mut self) -> Result<bool, S::Error> {
|
||||
if self.current_index >= self.line_lens.len() {
|
||||
return Ok(false);
|
||||
}
|
||||
let window_end =
|
||||
std::cmp::min(self.line_lens.len(), self.current_index + self.window_lines);
|
||||
let window_start_off = self.line_offset(self.current_index);
|
||||
let window_end_off = self.line_offset(window_end);
|
||||
let line0_len = self.line_lens[self.current_index];
|
||||
|
||||
{
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
let window_bytes =
|
||||
&self.buf[self.buf_start + window_start_off
|
||||
..self.buf_start + window_end_off];
|
||||
if self.config.invert_match {
|
||||
if !sink_inverted_line(
|
||||
&mut self.core,
|
||||
self.config,
|
||||
buffer,
|
||||
window_bytes,
|
||||
window_start_off,
|
||||
line0_len,
|
||||
)? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !sink_matched_line(
|
||||
&mut self.core,
|
||||
self.config,
|
||||
buffer,
|
||||
window_bytes,
|
||||
window_start_off,
|
||||
line0_len,
|
||||
)? {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let drop_upto = window_start_off + line0_len;
|
||||
if self.config.passthru {
|
||||
if !self.core.other_context_by_line(buffer, drop_upto)? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !self.core.after_context_by_line(buffer, drop_upto)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
|
||||
self.current_index += 1;
|
||||
if self.current_index > self.config.before_context {
|
||||
let drop_len = self.line_lens.pop_front().unwrap();
|
||||
self.shift_buffer(drop_len);
|
||||
self.current_index -= 1;
|
||||
}
|
||||
|
||||
if self.eof && self.current_index >= self.line_lens.len() {
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
if self.config.passthru {
|
||||
if !self.core.other_context_by_line(buffer, buffer.len())? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !self.core.after_context_by_line(buffer, buffer.len())? {
|
||||
return Ok(false);
|
||||
}
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn line_offset(&self, idx: usize) -> usize {
|
||||
self.line_lens.iter().take(idx).sum()
|
||||
}
|
||||
|
||||
fn shift_buffer(&mut self, consumed: usize) {
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
self.core.advance_buffer(buffer, consumed);
|
||||
self.buf_start += consumed;
|
||||
self.abs_start += consumed as u64;
|
||||
if self.buf_start > 0 && self.buf_start > self.buf.len() / 2 {
|
||||
self.buf.copy_within(self.buf_start.., 0);
|
||||
let new_len = self.buf.len() - self.buf_start;
|
||||
self.buf.truncate(new_len);
|
||||
self.buf_start = 0;
|
||||
}
|
||||
}
|
||||
|
||||
fn byte_count(&mut self) -> u64 {
|
||||
match self.core.binary_byte_offset() {
|
||||
Some(offset) if offset < self.core.pos() as u64 => offset,
|
||||
_ => self.abs_start + (self.buf.len() - self.buf_start) as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn sink_matched_line<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
window_bytes: &[u8],
|
||||
window_start_off: usize,
|
||||
line0_len: usize,
|
||||
) -> Result<bool, S::Error> {
|
||||
let mut pos = 0;
|
||||
let mut last_match: Option<Range> = None;
|
||||
while let Some(mat) = find_in_window(core, window_bytes, pos)? {
|
||||
if mat.start() >= line0_len {
|
||||
break;
|
||||
}
|
||||
let line = lines::locate(
|
||||
window_bytes,
|
||||
config.line_term.as_byte(),
|
||||
mat,
|
||||
)
|
||||
.offset(window_start_off);
|
||||
match last_match.take() {
|
||||
None => {
|
||||
last_match = Some(line);
|
||||
}
|
||||
Some(last) => {
|
||||
if last.end() >= line.start() {
|
||||
last_match = Some(last.with_end(line.end()));
|
||||
} else {
|
||||
if !sink_context(core, config, buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
last_match = Some(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
pos = mat.end();
|
||||
if mat.is_empty() && pos < window_bytes.len() {
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
if let Some(last) = last_match.take() {
|
||||
if !sink_context(core, config, buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn sink_inverted_line<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
window_bytes: &[u8],
|
||||
window_start_off: usize,
|
||||
line0_len: usize,
|
||||
) -> Result<bool, S::Error> {
|
||||
let mut pos = 0;
|
||||
while let Some(mat) = find_in_window(core, window_bytes, pos)? {
|
||||
if mat.start() >= line0_len {
|
||||
break;
|
||||
}
|
||||
if mat.start() < line0_len {
|
||||
return Ok(true);
|
||||
}
|
||||
pos = mat.end();
|
||||
if mat.is_empty() && pos < window_bytes.len() {
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
let line = Range::new(window_start_off, window_start_off + line0_len);
|
||||
if !sink_context(core, config, buffer, &line)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &line)? {
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn find_in_window<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
window_bytes: &[u8],
|
||||
pos: usize,
|
||||
) -> Result<Option<Range>, S::Error> {
|
||||
core.find(&window_bytes[pos..])
|
||||
.map(|m| m.map(|m| m.offset(pos)))
|
||||
}
|
||||
|
||||
fn sink_context<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
range: &Range,
|
||||
) -> Result<bool, S::Error> {
|
||||
if config.passthru {
|
||||
if !core.other_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
if !core.after_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.before_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct MultiLine<'s, M, S> {
|
||||
config: &'s Config,
|
||||
@@ -518,6 +862,37 @@ byte count:366
|
||||
.test();
|
||||
}
|
||||
|
||||
#[test]
fn multi_line_window_limits_match() {
    let haystack = "a\nb\nc\nd\n";
    let matcher = RegexMatcher::new("a\nb\nc");

    // Search the haystack with the given window size and return whatever
    // the sink collected, as text.
    let search_with_window = |window: usize| -> String {
        let mut sink = KitchenSink::new();
        let mut builder = SearcherBuilder::new();
        builder
            .multi_line(true)
            .multiline_window(Some(window))
            .line_number(false);
        let mut searcher = builder.build();
        searcher
            .search_slice(&matcher, haystack.as_bytes(), &mut sink)
            .unwrap();
        String::from_utf8(sink.as_bytes().to_vec()).unwrap()
    };

    // The pattern spans three lines, so a two-line window finds nothing.
    let got = search_with_window(2);
    let exp = format!("\nbyte count:{}\n", haystack.len());
    assert_eq!(exp, got);

    // A three-line window is just large enough for the match.
    let exp = format!(
        "0:a\n2:b\n4:c\n\nbyte count:{}\n",
        haystack.len()
    );
    let got = search_with_window(3);
    assert_eq!(exp, got);
}
|
||||
|
||||
#[test]
|
||||
fn multi_line_overlap2() {
|
||||
let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx";
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
|
||||
LineBufferBuilder, LineBufferReader, alloc_error,
|
||||
},
|
||||
searcher::glue::{MultiLine, ReadByLine, SliceByLine},
|
||||
searcher::glue::{MultiLine, ReadByLine, SliceByLine, WindowedMultiLine},
|
||||
sink::{Sink, SinkError},
|
||||
};
|
||||
|
||||
@@ -172,6 +172,8 @@ pub struct Config {
|
||||
binary: BinaryDetection,
|
||||
/// Whether to enable matching across multiple lines.
|
||||
multi_line: bool,
|
||||
/// The maximum number of lines a multi-line match may span.
|
||||
multiline_window: Option<usize>,
|
||||
/// An encoding that, when present, causes the searcher to transcode all
|
||||
/// input from the encoding to UTF-8.
|
||||
encoding: Option<Encoding>,
|
||||
@@ -197,6 +199,7 @@ impl Default for Config {
|
||||
mmap: MmapChoice::default(),
|
||||
binary: BinaryDetection::default(),
|
||||
multi_line: false,
|
||||
multiline_window: None,
|
||||
encoding: None,
|
||||
bom_sniffing: true,
|
||||
stop_on_nonmatch: false,
|
||||
@@ -390,6 +393,15 @@ impl SearcherBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Limit multi-line matches to a window of at most `line_count` lines.
|
||||
pub fn multiline_window(
|
||||
&mut self,
|
||||
line_count: Option<usize>,
|
||||
) -> &mut SearcherBuilder {
|
||||
self.config.multiline_window = line_count;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to include a fixed number of lines after every match.
|
||||
///
|
||||
/// When this is set to a non-zero number, then the searcher will report
|
||||
@@ -694,6 +706,13 @@ impl Searcher {
|
||||
// enabled. This pre-allocates a buffer roughly the size of the file,
|
||||
// which isn't possible when searching an arbitrary std::io::Read.
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if self.config.multiline_window.is_some() {
|
||||
log::trace!(
|
||||
"{:?}: searching via windowed multiline strategy",
|
||||
path
|
||||
);
|
||||
return self.search_reader(matcher, file, write_to);
|
||||
}
|
||||
log::trace!(
|
||||
"{:?}: reading entire file on to heap for mulitline",
|
||||
path
|
||||
@@ -744,6 +763,18 @@ impl Searcher {
|
||||
.map_err(S::Error::error_io)?;
|
||||
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if let Some(window_lines) = self.config.multiline_window {
|
||||
let mut line_buffer = self.line_buffer.borrow_mut();
|
||||
let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
|
||||
log::trace!("generic reader: searching via windowed multiline");
|
||||
return WindowedMultiLine::new(
|
||||
self,
|
||||
matcher,
|
||||
window_lines,
|
||||
write_to,
|
||||
)
|
||||
.run_reader(rdr);
|
||||
}
|
||||
log::trace!(
|
||||
"generic reader: reading everything to heap for multiline"
|
||||
);
|
||||
@@ -786,6 +817,16 @@ impl Searcher {
|
||||
return self.search_reader(matcher, slice, write_to);
|
||||
}
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if let Some(window_lines) = self.config.multiline_window {
|
||||
log::trace!("slice reader: searching via windowed multiline");
|
||||
return WindowedMultiLine::new(
|
||||
self,
|
||||
matcher,
|
||||
window_lines,
|
||||
write_to,
|
||||
)
|
||||
.run_slice(slice);
|
||||
}
|
||||
log::trace!("slice reader: searching via multiline strategy");
|
||||
MultiLine::new(self, matcher, slice, write_to).run()
|
||||
} else {
|
||||
@@ -865,6 +906,12 @@ impl Searcher {
|
||||
self.config.multi_line
|
||||
}
|
||||
|
||||
/// Returns the maximum number of lines a multi-line match may span.
|
||||
#[inline]
|
||||
pub fn multiline_window(&self) -> Option<usize> {
|
||||
self.config.multiline_window
|
||||
}
|
||||
|
||||
/// Returns true if and only if this searcher is configured to stop when it
|
||||
/// finds a non-matching line after a matching one.
|
||||
#[inline]
|
||||
|
||||
Reference in New Issue
Block a user