rgs: added multiline window limit and in-file result indexing (work in progress)

This commit is contained in:
2025-12-23 04:01:55 -05:00
parent cd1f981bea
commit ad6ec1b4c5
9 changed files with 599 additions and 8 deletions

View File

@@ -212,6 +212,18 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
consumed
}
/// Rebase the core's buffer-relative bookkeeping after the caller has
/// discarded the first `consumed` bytes of `buf`.
///
/// `buf` is the buffer as it looked *before* the prefix was dropped. The
/// dropped prefix is passed to `count_lines` first, the absolute byte
/// offset is advanced by `consumed`, and then every offset that is relative
/// to the start of the buffer is shifted down by `consumed` (saturating at
/// zero).
///
/// A `consumed` of zero is a no-op.
pub(crate) fn advance_buffer(&mut self, buf: &[u8], consumed: usize) {
    if consumed == 0 {
        return;
    }
    // Must run before any offset is rebased: it reads `buf` using the
    // pre-shift coordinates.
    self.count_lines(buf, consumed);
    self.absolute_byte_offset += consumed as u64;
    // Rebase rather than reset to zero. If lines beyond the dropped prefix
    // have already been counted (e.g. a match was reported on a later line
    // while earlier lines were retained as context), resetting would make
    // the next count walk over them a second time.
    self.last_line_counted = self.last_line_counted.saturating_sub(consumed);
    self.last_line_visited = self.last_line_visited.saturating_sub(consumed);
    self.set_pos(self.pos().saturating_sub(consumed));
}
pub(crate) fn detect_binary(
&mut self,
buf: &[u8],

View File

@@ -1,7 +1,9 @@
use grep_matcher::Matcher;
use std::collections::VecDeque;
use crate::{
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader},
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader, alloc_error},
lines::{self, LineStep},
searcher::{Config, Range, Searcher, core::Core},
sink::{Sink, SinkError},
@@ -138,6 +140,348 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> {
}
}
/// A multi-line searcher that only ever matches against a bounded window
/// of buffered lines instead of the entire input.
///
/// Lines are copied into `buf` as they are read; each line in turn becomes
/// the "current" line and is searched together with the lines that follow
/// it, up to `window_lines` lines in total.
#[derive(Debug)]
pub(crate) struct WindowedMultiLine<'s, M, S> {
/// The configuration of the searcher that created this value.
config: &'s Config,
/// Shared search core used for matching and for reporting to the sink.
core: Core<'s, M, S>,
/// Maximum number of lines that make up one search window.
window_lines: usize,
/// Backing storage for the buffered lines; only `buf[buf_start..]` is live.
buf: Vec<u8>,
/// Offset into `buf` of the first byte that has not been dropped yet.
buf_start: usize,
/// Byte length of every buffered line, oldest line first.
line_lens: VecDeque<usize>,
/// Total number of bytes dropped from the front of the buffer so far.
abs_start: u64,
/// Index into `line_lens` of the line that will be processed next.
current_index: usize,
/// Set once no further input will be added to the buffer.
eof: bool,
}
impl<'s, M: Matcher, S: Sink> WindowedMultiLine<'s, M, S> {
pub(crate) fn new(
searcher: &'s Searcher,
matcher: M,
window_lines: usize,
write_to: S,
) -> WindowedMultiLine<'s, M, S> {
debug_assert!(searcher.multi_line_with_matcher(&matcher));
debug_assert!(window_lines > 0);
WindowedMultiLine {
config: &searcher.config,
core: Core::new(searcher, matcher, write_to, true),
window_lines,
buf: Vec::new(),
buf_start: 0,
line_lens: VecDeque::new(),
abs_start: 0,
current_index: 0,
eof: false,
}
}
/// Run the windowed search over everything `rdr` yields.
///
/// Alternates between topping up the line window from the reader and
/// processing the current line. The loop ends when the reader is exhausted
/// and no buffered lines remain, or when processing asks to stop. The core
/// is always finished with the final byte count and binary offset, even if
/// `begin` declined to start the search.
pub(crate) fn run_reader<R: std::io::Read>(
    mut self,
    mut rdr: LineBufferReader<'s, R>,
) -> Result<(), S::Error> {
    if self.core.begin()? {
        let mut already_binary = rdr.binary_byte_offset().is_some();
        loop {
            let more_input =
                self.fill_reader(&mut rdr, &mut already_binary)?;
            if !more_input && self.line_lens.is_empty() {
                break;
            }
            if !self.process_current_line()? {
                break;
            }
        }
    }
    let byte_count = self.byte_count();
    let binary_byte_offset = self.core.binary_byte_offset();
    self.core.finish(byte_count, binary_byte_offset)
}
/// Run the windowed search over an in-memory `slice`.
///
/// Binary detection is run on at most the first `DEFAULT_BUFFER_CAPACITY`
/// bytes; lines are only searched when `detect_binary` returns `false`.
/// The core is always finished with the final byte count and binary
/// offset.
///
/// NOTE(review): every line of `slice` is copied into the internal buffer
/// before the first line is processed, so on this path memory use (and the
/// `heap_limit` check in `push_line`) scales with the slice rather than
/// with the window.
pub(crate) fn run_slice(mut self, slice: &'s [u8]) -> Result<(), S::Error> {
    if self.core.begin()? {
        let sniff_len = std::cmp::min(slice.len(), DEFAULT_BUFFER_CAPACITY);
        let sniff_range = Range::new(0, sniff_len);
        let binary_stop = self.core.detect_binary(slice, &sniff_range)?;
        if !binary_stop {
            let line_term = self.config.line_term.as_byte();
            let mut stepper = LineStep::new(line_term, 0, slice.len());
            while let Some(line) = stepper.next_match(slice) {
                self.push_line(&slice[line])?;
            }
            // All input is buffered now; drain it one line at a time.
            self.eof = true;
            while !self.line_lens.is_empty()
                && self.process_current_line()?
            {}
        }
    }
    let byte_count = self.byte_count();
    let binary_byte_offset = self.core.binary_byte_offset();
    self.core.finish(byte_count, binary_byte_offset)
}
fn fill_reader<R: std::io::Read>(
&mut self,
rdr: &mut LineBufferReader<'s, R>,
already_binary: &mut bool,
) -> Result<bool, S::Error> {
while !self.eof
&& self.line_lens.len() < self.current_index + self.window_lines
{
let didread = match rdr.fill() {
Err(err) => return Err(S::Error::error_io(err)),
Ok(didread) => didread,
};
if !*already_binary {
if let Some(offset) = rdr.binary_byte_offset() {
*already_binary = true;
if !self.core.binary_data(offset)? {
self.eof = true;
return Ok(false);
}
}
}
if !didread {
self.eof = true;
break;
}
let buf = rdr.buffer();
let mut stepper = LineStep::new(
self.config.line_term.as_byte(),
0,
buf.len(),
);
while let Some(line) = stepper.next_match(buf) {
let bytes = &buf[line];
self.push_line(bytes)?;
}
rdr.consume(buf.len());
}
Ok(!self.eof)
}
fn push_line(&mut self, line: &[u8]) -> Result<(), S::Error> {
self.buf.extend_from_slice(line);
self.line_lens.push_back(line.len());
if let Some(limit) = self.config.heap_limit {
let used = self.buf.len() - self.buf_start;
if used > limit {
return Err(S::Error::error_io(alloc_error(limit)));
}
}
Ok(())
}
/// Search the window that starts at the current line, report the result to
/// the sink and advance to the next line.
///
/// Returns `Ok(true)` when the caller should keep going. Returns
/// `Ok(false)` when there is no line left to process, when the sink (via
/// the core) asked to stop, or when the last buffered line has been
/// processed after end of input.
fn process_current_line(&mut self) -> Result<bool, S::Error> {
// Nothing buffered at or after the cursor.
if self.current_index >= self.line_lens.len() {
return Ok(false);
}
// The window covers the current line plus the lines after it, up to
// `window_lines` lines in total, clipped to what is buffered.
let window_end =
std::cmp::min(self.line_lens.len(), self.current_index + self.window_lines);
// Both offsets are relative to the live buffer (`buf[buf_start..]`).
let window_start_off = self.line_offset(self.current_index);
let window_end_off = self.line_offset(window_end);
let line0_len = self.line_lens[self.current_index];
{
// `buffer` is the whole live buffer (it may start with lines retained
// before the cursor); `window_bytes` is just the search window.
let buffer = &self.buf[self.buf_start..];
let window_bytes =
&self.buf[self.buf_start + window_start_off
..self.buf_start + window_end_off];
if self.config.invert_match {
if !sink_inverted_line(
&mut self.core,
self.config,
buffer,
window_bytes,
window_start_off,
line0_len,
)? {
return Ok(false);
}
} else if !sink_matched_line(
&mut self.core,
self.config,
buffer,
window_bytes,
window_start_off,
line0_len,
)? {
return Ok(false);
}
// Let the core handle context for everything up to the end of the
// current line before that line can be dropped.
let drop_upto = window_start_off + line0_len;
if self.config.passthru {
if !self.core.other_context_by_line(buffer, drop_upto)? {
return Ok(false);
}
} else if !self.core.after_context_by_line(buffer, drop_upto)? {
return Ok(false);
}
}
self.current_index += 1;
// Keep at most `before_context` lines in front of the cursor: once there
// are more, drop the oldest line and pull the cursor back by one.
if self.current_index > self.config.before_context {
let drop_len = self.line_lens.pop_front().unwrap();
self.shift_buffer(drop_len);
self.current_index -= 1;
}
// No more input and no more lines to search: give the core one last
// chance to emit context over what is left, then stop.
if self.eof && self.current_index >= self.line_lens.len() {
let buffer = &self.buf[self.buf_start..];
if self.config.passthru {
if !self.core.other_context_by_line(buffer, buffer.len())? {
return Ok(false);
}
} else if !self.core.after_context_by_line(buffer, buffer.len())? {
return Ok(false);
}
return Ok(false);
}
Ok(true)
}
/// Byte offset, relative to the live buffer, at which buffered line `idx`
/// starts: the combined length of the first `idx` buffered lines.
///
/// If `idx` is past the last buffered line, the combined length of all
/// buffered lines is returned.
fn line_offset(&self, idx: usize) -> usize {
    let mut offset = 0;
    for len in self.line_lens.iter().take(idx) {
        offset += *len;
    }
    offset
}
/// Drop the first `consumed` bytes of the live buffer.
///
/// The core is told about the drop first (it is handed the buffer as it
/// looked before the drop), then the start marker and the dropped-bytes
/// total are advanced. Once the dead prefix is larger than half of the
/// backing vector, the live bytes are moved to the front so the vector
/// does not keep growing.
fn shift_buffer(&mut self, consumed: usize) {
    self.core.advance_buffer(&self.buf[self.buf_start..], consumed);
    self.buf_start += consumed;
    self.abs_start += consumed as u64;

    let dead = self.buf_start;
    if dead > 0 && dead > self.buf.len() / 2 {
        // Removes the dead prefix and shifts the live bytes to index 0.
        self.buf.drain(..dead);
        self.buf_start = 0;
    }
}
/// Number of bytes this search is considered to have covered.
///
/// This is normally the number of bytes taken into the buffer so far
/// (bytes already dropped plus bytes still live). If the core recorded a
/// binary offset that lies before that total, the binary offset is
/// reported instead.
fn byte_count(&mut self) -> u64 {
    // Compare against the total, not `core.pos()`: the core's position is
    // rebased to the current window every time the buffer is shifted, so
    // it cannot be compared with an offset into the whole input.
    let total = self.abs_start + (self.buf.len() - self.buf_start) as u64;
    match self.core.binary_byte_offset() {
        Some(offset) if offset < total => offset,
        _ => total,
    }
}
}
/// Report every match that starts inside the first line of the window.
///
/// `window_bytes` is the search window and `line0_len` the byte length of
/// its first line. `buffer` is the whole live buffer, and
/// `window_start_off` is where the window begins inside it. Each match is
/// widened to whole lines and translated into `buffer` coordinates.
/// Overlapping or touching line ranges are merged, and each merged range
/// is reported once, preceded by its context.
///
/// Returns `Ok(false)` as soon as the core asks to stop, `Ok(true)`
/// otherwise.
fn sink_matched_line<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    window_bytes: &[u8],
    window_start_off: usize,
    line0_len: usize,
) -> Result<bool, S::Error> {
    let line_term = config.line_term.as_byte();
    let mut search_from = 0;
    // The merged line range that has been found but not reported yet.
    let mut pending: Option<Range> = None;
    loop {
        let mat = match find_in_window(core, window_bytes, search_from)? {
            Some(mat) if mat.start() < line0_len => mat,
            // No match, or the match starts after the first line.
            _ => break,
        };
        let located = lines::locate(window_bytes, line_term, mat)
            .offset(window_start_off);
        pending = match pending {
            None => Some(located),
            Some(prev) if prev.end() >= located.start() => {
                Some(prev.with_end(located.end()))
            }
            Some(prev) => {
                if !sink_context(core, config, buffer, &prev)?
                    || !core.matched(buffer, &prev)?
                {
                    return Ok(false);
                }
                Some(located)
            }
        };
        // Step past an empty match so the search makes progress.
        search_from = if mat.is_empty() && mat.end() < window_bytes.len() {
            mat.end() + 1
        } else {
            mat.end()
        };
    }
    match pending {
        None => Ok(true),
        Some(last) => Ok(sink_context(core, config, buffer, &last)?
            && core.matched(buffer, &last)?),
    }
}
/// Inverted-match counterpart of `sink_matched_line`: report the first line
/// of the window unless a match starts inside it.
///
/// `window_bytes` is the search window and `line0_len` the byte length of
/// its first line. `buffer` is the whole live buffer, and
/// `window_start_off` is where the window begins inside it.
///
/// Only the first match found in the window is examined. If it starts
/// inside the first line, nothing is reported. Otherwise (no match, or the
/// first match starts on a later line) the first line is reported, preceded
/// by its context.
///
/// NOTE(review): a line that is merely covered by a match which started in
/// an earlier window is not recognised here and would be reported as
/// non-matching — confirm whether that is intended.
///
/// Returns `Ok(false)` when the core asks to stop, `Ok(true)` otherwise.
fn sink_inverted_line<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    window_bytes: &[u8],
    window_start_off: usize,
    line0_len: usize,
) -> Result<bool, S::Error> {
    // One probe is enough: the previous loop either returned or broke on
    // its very first match, so it never searched a second time.
    if let Some(mat) = find_in_window(core, window_bytes, 0)? {
        if mat.start() < line0_len {
            return Ok(true);
        }
    }
    let line = Range::new(window_start_off, window_start_off + line0_len);
    if !sink_context(core, config, buffer, &line)? {
        return Ok(false);
    }
    let keep_going = core.matched(buffer, &line)?;
    Ok(keep_going)
}
/// Search `window_bytes` starting at byte `pos`.
///
/// The search itself runs on the sub-slice `window_bytes[pos..]`; a match
/// is translated back so that its range is relative to the start of
/// `window_bytes`. Panics if `pos` is greater than `window_bytes.len()`.
fn find_in_window<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    window_bytes: &[u8],
    pos: usize,
) -> Result<Option<Range>, S::Error> {
    let tail = &window_bytes[pos..];
    let found = core.find(tail)?;
    Ok(found.map(|mat| mat.offset(pos)))
}
/// Let the core process context lines in `buffer` up to the start of
/// `range`, before `range` itself is reported.
///
/// In passthru mode this is a single `other_context_by_line` call.
/// Otherwise `after_context_by_line` runs first, followed by
/// `before_context_by_line`, and the second call is skipped if the first
/// one asks to stop.
///
/// Returns `Ok(false)` when the core asks to stop, `Ok(true)` otherwise.
fn sink_context<M: Matcher, S: Sink>(
    core: &mut Core<'_, M, S>,
    config: &Config,
    buffer: &[u8],
    range: &Range,
) -> Result<bool, S::Error> {
    let upto = range.start();
    if config.passthru {
        let keep_going = core.other_context_by_line(buffer, upto)?;
        return Ok(keep_going);
    }
    Ok(core.after_context_by_line(buffer, upto)?
        && core.before_context_by_line(buffer, upto)?)
}
#[derive(Debug)]
pub(crate) struct MultiLine<'s, M, S> {
config: &'s Config,
@@ -518,6 +862,37 @@ byte count:366
.test();
}
// A three-line pattern must not match when the window is limited to two
// lines, and must match once the window is wide enough to hold it.
#[test]
fn multi_line_window_limits_match() {
    let haystack = "a\nb\nc\nd\n";
    let matcher = RegexMatcher::new("a\nb\nc");

    // Runs a slice search with the given window and returns the sink output.
    let search_with_window = |window_lines: usize| -> String {
        let mut builder = SearcherBuilder::new();
        builder
            .multi_line(true)
            .multiline_window(Some(window_lines))
            .line_number(false);
        let mut sink = KitchenSink::new();
        let mut searcher = builder.build();
        searcher
            .search_slice(&matcher, haystack.as_bytes(), &mut sink)
            .unwrap();
        String::from_utf8(sink.as_bytes().to_vec()).unwrap()
    };

    let got = search_with_window(2);
    let exp = format!("\nbyte count:{}\n", haystack.len());
    assert_eq!(exp, got);

    let exp = format!(
        "0:a\n2:b\n4:c\n\nbyte count:{}\n",
        haystack.len()
    );
    let got = search_with_window(3);
    assert_eq!(exp, got);
}
#[test]
fn multi_line_overlap2() {
let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx";

View File

@@ -16,7 +16,7 @@ use crate::{
self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
LineBufferBuilder, LineBufferReader, alloc_error,
},
searcher::glue::{MultiLine, ReadByLine, SliceByLine},
searcher::glue::{MultiLine, ReadByLine, SliceByLine, WindowedMultiLine},
sink::{Sink, SinkError},
};
@@ -172,6 +172,8 @@ pub struct Config {
binary: BinaryDetection,
/// Whether to enable matching across multiple lines.
multi_line: bool,
/// The maximum number of lines a multi-line match may span.
multiline_window: Option<usize>,
/// An encoding that, when present, causes the searcher to transcode all
/// input from the encoding to UTF-8.
encoding: Option<Encoding>,
@@ -197,6 +199,7 @@ impl Default for Config {
mmap: MmapChoice::default(),
binary: BinaryDetection::default(),
multi_line: false,
multiline_window: None,
encoding: None,
bom_sniffing: true,
stop_on_nonmatch: false,
@@ -390,6 +393,15 @@ impl SearcherBuilder {
self
}
/// Limit multi-line matches to a window of at most `line_count` lines.
///
/// With `Some(n)`, multi-line searches go through the windowed strategy,
/// which searches each line together with the lines that follow it, `n`
/// lines in total. With `None` (the default) no limit is applied and the
/// regular multi-line strategy is used.
///
/// A window of zero lines cannot contain any match, and the windowed
/// searcher asserts a non-zero window in debug builds, so `Some(0)` is
/// clamped to `Some(1)`.
///
/// This setting has no effect unless multi-line mode is enabled.
pub fn multiline_window(
    &mut self,
    line_count: Option<usize>,
) -> &mut SearcherBuilder {
    self.config.multiline_window = line_count.map(|n| n.max(1));
    self
}
/// Whether to include a fixed number of lines after every match.
///
/// When this is set to a non-zero number, then the searcher will report
@@ -694,6 +706,13 @@ impl Searcher {
// enabled. This pre-allocates a buffer roughly the size of the file,
// which isn't possible when searching an arbitrary std::io::Read.
if self.multi_line_with_matcher(&matcher) {
if self.config.multiline_window.is_some() {
log::trace!(
"{:?}: searching via windowed multiline strategy",
path
);
return self.search_reader(matcher, file, write_to);
}
log::trace!(
"{:?}: reading entire file on to heap for mulitline",
path
@@ -744,6 +763,18 @@ impl Searcher {
.map_err(S::Error::error_io)?;
if self.multi_line_with_matcher(&matcher) {
if let Some(window_lines) = self.config.multiline_window {
let mut line_buffer = self.line_buffer.borrow_mut();
let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
log::trace!("generic reader: searching via windowed multiline");
return WindowedMultiLine::new(
self,
matcher,
window_lines,
write_to,
)
.run_reader(rdr);
}
log::trace!(
"generic reader: reading everything to heap for multiline"
);
@@ -786,6 +817,16 @@ impl Searcher {
return self.search_reader(matcher, slice, write_to);
}
if self.multi_line_with_matcher(&matcher) {
if let Some(window_lines) = self.config.multiline_window {
log::trace!("slice reader: searching via windowed multiline");
return WindowedMultiLine::new(
self,
matcher,
window_lines,
write_to,
)
.run_slice(slice);
}
log::trace!("slice reader: searching via multiline strategy");
MultiLine::new(self, matcher, slice, write_to).run()
} else {
@@ -865,6 +906,12 @@ impl Searcher {
self.config.multi_line
}
/// Returns the maximum number of lines a multi-line match may span.
///
/// `None`, which is the default, means that no window limit has been
/// configured; multi-line searches then do not use the windowed strategy.
#[inline]
pub fn multiline_window(&self) -> Option<usize> {
self.config.multiline_window
}
/// Returns true if and only if this searcher is configured to stop when it
/// finds a non-matching line after a matching one.
#[inline]