rgs: added multiline window limit and in-file result indexing (work in progress)
This commit is contained in:
@@ -212,6 +212,18 @@ impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
|
||||
consumed
|
||||
}
|
||||
|
||||
pub(crate) fn advance_buffer(&mut self, buf: &[u8], consumed: usize) {
|
||||
if consumed == 0 {
|
||||
return;
|
||||
}
|
||||
self.count_lines(buf, consumed);
|
||||
self.absolute_byte_offset += consumed as u64;
|
||||
self.last_line_counted = 0;
|
||||
self.last_line_visited =
|
||||
self.last_line_visited.saturating_sub(consumed);
|
||||
self.set_pos(self.pos().saturating_sub(consumed));
|
||||
}
|
||||
|
||||
pub(crate) fn detect_binary(
|
||||
&mut self,
|
||||
buf: &[u8],
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use grep_matcher::Matcher;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use crate::{
|
||||
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader},
|
||||
line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader, alloc_error},
|
||||
lines::{self, LineStep},
|
||||
searcher::{Config, Range, Searcher, core::Core},
|
||||
sink::{Sink, SinkError},
|
||||
@@ -138,6 +140,348 @@ impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct WindowedMultiLine<'s, M, S> {
|
||||
config: &'s Config,
|
||||
core: Core<'s, M, S>,
|
||||
window_lines: usize,
|
||||
buf: Vec<u8>,
|
||||
buf_start: usize,
|
||||
line_lens: VecDeque<usize>,
|
||||
abs_start: u64,
|
||||
current_index: usize,
|
||||
eof: bool,
|
||||
}
|
||||
|
||||
impl<'s, M: Matcher, S: Sink> WindowedMultiLine<'s, M, S> {
|
||||
pub(crate) fn new(
|
||||
searcher: &'s Searcher,
|
||||
matcher: M,
|
||||
window_lines: usize,
|
||||
write_to: S,
|
||||
) -> WindowedMultiLine<'s, M, S> {
|
||||
debug_assert!(searcher.multi_line_with_matcher(&matcher));
|
||||
debug_assert!(window_lines > 0);
|
||||
|
||||
WindowedMultiLine {
|
||||
config: &searcher.config,
|
||||
core: Core::new(searcher, matcher, write_to, true),
|
||||
window_lines,
|
||||
buf: Vec::new(),
|
||||
buf_start: 0,
|
||||
line_lens: VecDeque::new(),
|
||||
abs_start: 0,
|
||||
current_index: 0,
|
||||
eof: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn run_reader<R: std::io::Read>(
|
||||
mut self,
|
||||
mut rdr: LineBufferReader<'s, R>,
|
||||
) -> Result<(), S::Error> {
|
||||
if self.core.begin()? {
|
||||
let mut already_binary = rdr.binary_byte_offset().is_some();
|
||||
while self.fill_reader(&mut rdr, &mut already_binary)?
|
||||
|| !self.line_lens.is_empty()
|
||||
{
|
||||
if !self.process_current_line()? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
let byte_count = self.byte_count();
|
||||
let binary_byte_offset = self.core.binary_byte_offset();
|
||||
self.core.finish(byte_count, binary_byte_offset)
|
||||
}
|
||||
|
||||
pub(crate) fn run_slice(mut self, slice: &'s [u8]) -> Result<(), S::Error> {
|
||||
if self.core.begin()? {
|
||||
let binary_upto =
|
||||
std::cmp::min(slice.len(), DEFAULT_BUFFER_CAPACITY);
|
||||
let binary_range = Range::new(0, binary_upto);
|
||||
if !self.core.detect_binary(slice, &binary_range)? {
|
||||
let mut stepper = LineStep::new(
|
||||
self.config.line_term.as_byte(),
|
||||
0,
|
||||
slice.len(),
|
||||
);
|
||||
while let Some(line) = stepper.next_match(slice) {
|
||||
self.push_line(&slice[line])?;
|
||||
}
|
||||
self.eof = true;
|
||||
while !self.line_lens.is_empty() {
|
||||
if !self.process_current_line()? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
let byte_count = self.byte_count();
|
||||
let binary_byte_offset = self.core.binary_byte_offset();
|
||||
self.core.finish(byte_count, binary_byte_offset)
|
||||
}
|
||||
|
||||
fn fill_reader<R: std::io::Read>(
|
||||
&mut self,
|
||||
rdr: &mut LineBufferReader<'s, R>,
|
||||
already_binary: &mut bool,
|
||||
) -> Result<bool, S::Error> {
|
||||
while !self.eof
|
||||
&& self.line_lens.len() < self.current_index + self.window_lines
|
||||
{
|
||||
let didread = match rdr.fill() {
|
||||
Err(err) => return Err(S::Error::error_io(err)),
|
||||
Ok(didread) => didread,
|
||||
};
|
||||
if !*already_binary {
|
||||
if let Some(offset) = rdr.binary_byte_offset() {
|
||||
*already_binary = true;
|
||||
if !self.core.binary_data(offset)? {
|
||||
self.eof = true;
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
if !didread {
|
||||
self.eof = true;
|
||||
break;
|
||||
}
|
||||
let buf = rdr.buffer();
|
||||
let mut stepper = LineStep::new(
|
||||
self.config.line_term.as_byte(),
|
||||
0,
|
||||
buf.len(),
|
||||
);
|
||||
while let Some(line) = stepper.next_match(buf) {
|
||||
let bytes = &buf[line];
|
||||
self.push_line(bytes)?;
|
||||
}
|
||||
rdr.consume(buf.len());
|
||||
}
|
||||
Ok(!self.eof)
|
||||
}
|
||||
|
||||
fn push_line(&mut self, line: &[u8]) -> Result<(), S::Error> {
|
||||
self.buf.extend_from_slice(line);
|
||||
self.line_lens.push_back(line.len());
|
||||
if let Some(limit) = self.config.heap_limit {
|
||||
let used = self.buf.len() - self.buf_start;
|
||||
if used > limit {
|
||||
return Err(S::Error::error_io(alloc_error(limit)));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_current_line(&mut self) -> Result<bool, S::Error> {
|
||||
if self.current_index >= self.line_lens.len() {
|
||||
return Ok(false);
|
||||
}
|
||||
let window_end =
|
||||
std::cmp::min(self.line_lens.len(), self.current_index + self.window_lines);
|
||||
let window_start_off = self.line_offset(self.current_index);
|
||||
let window_end_off = self.line_offset(window_end);
|
||||
let line0_len = self.line_lens[self.current_index];
|
||||
|
||||
{
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
let window_bytes =
|
||||
&self.buf[self.buf_start + window_start_off
|
||||
..self.buf_start + window_end_off];
|
||||
if self.config.invert_match {
|
||||
if !sink_inverted_line(
|
||||
&mut self.core,
|
||||
self.config,
|
||||
buffer,
|
||||
window_bytes,
|
||||
window_start_off,
|
||||
line0_len,
|
||||
)? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !sink_matched_line(
|
||||
&mut self.core,
|
||||
self.config,
|
||||
buffer,
|
||||
window_bytes,
|
||||
window_start_off,
|
||||
line0_len,
|
||||
)? {
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let drop_upto = window_start_off + line0_len;
|
||||
if self.config.passthru {
|
||||
if !self.core.other_context_by_line(buffer, drop_upto)? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !self.core.after_context_by_line(buffer, drop_upto)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
|
||||
self.current_index += 1;
|
||||
if self.current_index > self.config.before_context {
|
||||
let drop_len = self.line_lens.pop_front().unwrap();
|
||||
self.shift_buffer(drop_len);
|
||||
self.current_index -= 1;
|
||||
}
|
||||
|
||||
if self.eof && self.current_index >= self.line_lens.len() {
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
if self.config.passthru {
|
||||
if !self.core.other_context_by_line(buffer, buffer.len())? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else if !self.core.after_context_by_line(buffer, buffer.len())? {
|
||||
return Ok(false);
|
||||
}
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn line_offset(&self, idx: usize) -> usize {
|
||||
self.line_lens.iter().take(idx).sum()
|
||||
}
|
||||
|
||||
fn shift_buffer(&mut self, consumed: usize) {
|
||||
let buffer = &self.buf[self.buf_start..];
|
||||
self.core.advance_buffer(buffer, consumed);
|
||||
self.buf_start += consumed;
|
||||
self.abs_start += consumed as u64;
|
||||
if self.buf_start > 0 && self.buf_start > self.buf.len() / 2 {
|
||||
self.buf.copy_within(self.buf_start.., 0);
|
||||
let new_len = self.buf.len() - self.buf_start;
|
||||
self.buf.truncate(new_len);
|
||||
self.buf_start = 0;
|
||||
}
|
||||
}
|
||||
|
||||
fn byte_count(&mut self) -> u64 {
|
||||
match self.core.binary_byte_offset() {
|
||||
Some(offset) if offset < self.core.pos() as u64 => offset,
|
||||
_ => self.abs_start + (self.buf.len() - self.buf_start) as u64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn sink_matched_line<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
window_bytes: &[u8],
|
||||
window_start_off: usize,
|
||||
line0_len: usize,
|
||||
) -> Result<bool, S::Error> {
|
||||
let mut pos = 0;
|
||||
let mut last_match: Option<Range> = None;
|
||||
while let Some(mat) = find_in_window(core, window_bytes, pos)? {
|
||||
if mat.start() >= line0_len {
|
||||
break;
|
||||
}
|
||||
let line = lines::locate(
|
||||
window_bytes,
|
||||
config.line_term.as_byte(),
|
||||
mat,
|
||||
)
|
||||
.offset(window_start_off);
|
||||
match last_match.take() {
|
||||
None => {
|
||||
last_match = Some(line);
|
||||
}
|
||||
Some(last) => {
|
||||
if last.end() >= line.start() {
|
||||
last_match = Some(last.with_end(line.end()));
|
||||
} else {
|
||||
if !sink_context(core, config, buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
last_match = Some(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
pos = mat.end();
|
||||
if mat.is_empty() && pos < window_bytes.len() {
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
if let Some(last) = last_match.take() {
|
||||
if !sink_context(core, config, buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &last)? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn sink_inverted_line<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
window_bytes: &[u8],
|
||||
window_start_off: usize,
|
||||
line0_len: usize,
|
||||
) -> Result<bool, S::Error> {
|
||||
let mut pos = 0;
|
||||
while let Some(mat) = find_in_window(core, window_bytes, pos)? {
|
||||
if mat.start() >= line0_len {
|
||||
break;
|
||||
}
|
||||
if mat.start() < line0_len {
|
||||
return Ok(true);
|
||||
}
|
||||
pos = mat.end();
|
||||
if mat.is_empty() && pos < window_bytes.len() {
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
let line = Range::new(window_start_off, window_start_off + line0_len);
|
||||
if !sink_context(core, config, buffer, &line)? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.matched(buffer, &line)? {
|
||||
return Ok(false);
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
fn find_in_window<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
window_bytes: &[u8],
|
||||
pos: usize,
|
||||
) -> Result<Option<Range>, S::Error> {
|
||||
core.find(&window_bytes[pos..])
|
||||
.map(|m| m.map(|m| m.offset(pos)))
|
||||
}
|
||||
|
||||
fn sink_context<M: Matcher, S: Sink>(
|
||||
core: &mut Core<'_, M, S>,
|
||||
config: &Config,
|
||||
buffer: &[u8],
|
||||
range: &Range,
|
||||
) -> Result<bool, S::Error> {
|
||||
if config.passthru {
|
||||
if !core.other_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
if !core.after_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
if !core.before_context_by_line(buffer, range.start())? {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct MultiLine<'s, M, S> {
|
||||
config: &'s Config,
|
||||
@@ -518,6 +862,37 @@ byte count:366
|
||||
.test();
|
||||
}
|
||||
|
||||
// A pattern spanning three lines must not match inside a two-line window,
// and must match once the window is three lines long.
#[test]
fn multi_line_window_limits_match() {
    let haystack = "a\nb\nc\nd\n";
    let matcher = RegexMatcher::new("a\nb\nc");

    // Runs one slice search with the given window size and returns
    // everything the sink recorded.
    let search_with_window = |window: usize| -> String {
        let mut sink = KitchenSink::new();
        let mut searcher = SearcherBuilder::new()
            .multi_line(true)
            .multiline_window(Some(window))
            .line_number(false)
            .build();
        searcher
            .search_slice(&matcher, haystack.as_bytes(), &mut sink)
            .unwrap();
        String::from_utf8(sink.as_bytes().to_vec()).unwrap()
    };

    let exp = format!("\nbyte count:{}\n", haystack.len());
    let got = search_with_window(2);
    assert_eq!(exp, got);

    let exp = format!(
        "0:a\n2:b\n4:c\n\nbyte count:{}\n",
        haystack.len()
    );
    let got = search_with_window(3);
    assert_eq!(exp, got);
}
|
||||
|
||||
#[test]
|
||||
fn multi_line_overlap2() {
|
||||
let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx";
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::{
|
||||
self, BufferAllocation, DEFAULT_BUFFER_CAPACITY, LineBuffer,
|
||||
LineBufferBuilder, LineBufferReader, alloc_error,
|
||||
},
|
||||
searcher::glue::{MultiLine, ReadByLine, SliceByLine},
|
||||
searcher::glue::{MultiLine, ReadByLine, SliceByLine, WindowedMultiLine},
|
||||
sink::{Sink, SinkError},
|
||||
};
|
||||
|
||||
@@ -172,6 +172,8 @@ pub struct Config {
|
||||
binary: BinaryDetection,
|
||||
/// Whether to enable matching across multiple lines.
|
||||
multi_line: bool,
|
||||
/// The maximum number of lines a multi-line match may span.
|
||||
multiline_window: Option<usize>,
|
||||
/// An encoding that, when present, causes the searcher to transcode all
|
||||
/// input from the encoding to UTF-8.
|
||||
encoding: Option<Encoding>,
|
||||
@@ -197,6 +199,7 @@ impl Default for Config {
|
||||
mmap: MmapChoice::default(),
|
||||
binary: BinaryDetection::default(),
|
||||
multi_line: false,
|
||||
multiline_window: None,
|
||||
encoding: None,
|
||||
bom_sniffing: true,
|
||||
stop_on_nonmatch: false,
|
||||
@@ -390,6 +393,15 @@ impl SearcherBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Limit multi-line matches to a window of at most `line_count` lines.
|
||||
pub fn multiline_window(
|
||||
&mut self,
|
||||
line_count: Option<usize>,
|
||||
) -> &mut SearcherBuilder {
|
||||
self.config.multiline_window = line_count;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to include a fixed number of lines after every match.
|
||||
///
|
||||
/// When this is set to a non-zero number, then the searcher will report
|
||||
@@ -694,6 +706,13 @@ impl Searcher {
|
||||
// enabled. This pre-allocates a buffer roughly the size of the file,
|
||||
// which isn't possible when searching an arbitrary std::io::Read.
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if self.config.multiline_window.is_some() {
|
||||
log::trace!(
|
||||
"{:?}: searching via windowed multiline strategy",
|
||||
path
|
||||
);
|
||||
return self.search_reader(matcher, file, write_to);
|
||||
}
|
||||
log::trace!(
|
||||
"{:?}: reading entire file on to heap for mulitline",
|
||||
path
|
||||
@@ -744,6 +763,18 @@ impl Searcher {
|
||||
.map_err(S::Error::error_io)?;
|
||||
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if let Some(window_lines) = self.config.multiline_window {
|
||||
let mut line_buffer = self.line_buffer.borrow_mut();
|
||||
let rdr = LineBufferReader::new(decoder, &mut *line_buffer);
|
||||
log::trace!("generic reader: searching via windowed multiline");
|
||||
return WindowedMultiLine::new(
|
||||
self,
|
||||
matcher,
|
||||
window_lines,
|
||||
write_to,
|
||||
)
|
||||
.run_reader(rdr);
|
||||
}
|
||||
log::trace!(
|
||||
"generic reader: reading everything to heap for multiline"
|
||||
);
|
||||
@@ -786,6 +817,16 @@ impl Searcher {
|
||||
return self.search_reader(matcher, slice, write_to);
|
||||
}
|
||||
if self.multi_line_with_matcher(&matcher) {
|
||||
if let Some(window_lines) = self.config.multiline_window {
|
||||
log::trace!("slice reader: searching via windowed multiline");
|
||||
return WindowedMultiLine::new(
|
||||
self,
|
||||
matcher,
|
||||
window_lines,
|
||||
write_to,
|
||||
)
|
||||
.run_slice(slice);
|
||||
}
|
||||
log::trace!("slice reader: searching via multiline strategy");
|
||||
MultiLine::new(self, matcher, slice, write_to).run()
|
||||
} else {
|
||||
@@ -865,6 +906,12 @@ impl Searcher {
|
||||
self.config.multi_line
|
||||
}
|
||||
|
||||
/// Returns the maximum number of lines a multi-line match may span.
|
||||
#[inline]
|
||||
pub fn multiline_window(&self) -> Option<usize> {
|
||||
self.config.multiline_window
|
||||
}
|
||||
|
||||
/// Returns true if and only if this searcher is configured to stop when it
|
||||
/// finds a non-matching line after a matching one.
|
||||
#[inline]
|
||||
|
||||
Reference in New Issue
Block a user