Compare commits
167 Commits
0.9.0
...
grep-regex
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c84cfb6756 | ||
|
|
895e26a000 | ||
|
|
8c95290ff6 | ||
|
|
d6feeb7ff2 | ||
|
|
626ed00c19 | ||
|
|
332ad18401 | ||
|
|
fc3cf41247 | ||
|
|
a4868b8835 | ||
|
|
f99b991117 | ||
|
|
de0bc78982 | ||
|
|
147e96914c | ||
|
|
0abc40c23c | ||
|
|
f768796e4f | ||
|
|
da0c0c4705 | ||
|
|
05411b2b32 | ||
|
|
cc93db3b18 | ||
|
|
049354b766 | ||
|
|
386dd2806d | ||
|
|
5fe9a954e6 | ||
|
|
f158a42a71 | ||
|
|
5724391d39 | ||
|
|
0df71240ff | ||
|
|
f3164f2615 | ||
|
|
31d3e24130 | ||
|
|
bf842dbc7f | ||
|
|
6d5dba85bd | ||
|
|
afb89bcdad | ||
|
|
332dc56372 | ||
|
|
12a6ca45f9 | ||
|
|
9d703110cf | ||
|
|
e99b6bda0e | ||
|
|
276e2c9b9a | ||
|
|
9a9f54d44c | ||
|
|
47833b9ce7 | ||
|
|
44a9e37737 | ||
|
|
8fd05cacee | ||
|
|
4691d11034 | ||
|
|
519a6b68af | ||
|
|
9c940b45f4 | ||
|
|
0a167021c3 | ||
|
|
aeaa5fc1b1 | ||
|
|
7048a06c31 | ||
|
|
23be3cf850 | ||
|
|
b48bbf527d | ||
|
|
8eabe47b57 | ||
|
|
ff712bfd9d | ||
|
|
a7f2d48234 | ||
|
|
57500ad013 | ||
|
|
0b04553aff | ||
|
|
1ae121122f | ||
|
|
688003e51c | ||
|
|
718a00f6f2 | ||
|
|
7cbc535d70 | ||
|
|
7a6a40bae1 | ||
|
|
1e9ee2cc85 | ||
|
|
968491f8e9 | ||
|
|
63b0f31a22 | ||
|
|
7ecee299a5 | ||
|
|
dd396ff34e | ||
|
|
fb0a82f3c3 | ||
|
|
dbc8ca9cc1 | ||
|
|
c3db8db93d | ||
|
|
17ef4c40f3 | ||
|
|
a9e0477ea8 | ||
|
|
b3c5773266 | ||
|
|
118b950085 | ||
|
|
b45b2f58ea | ||
|
|
662a9bc73d | ||
|
|
401add0a99 | ||
|
|
f81b72721b | ||
|
|
1d4fccaadc | ||
|
|
09e464e674 | ||
|
|
31adff6f3c | ||
|
|
b41e596327 | ||
|
|
fb62266620 | ||
|
|
acf226c39d | ||
|
|
8299625e48 | ||
|
|
db256c87eb | ||
|
|
ba533f390e | ||
|
|
ba503eb677 | ||
|
|
f72c2dfd90 | ||
|
|
c0aa58b4f7 | ||
|
|
184ee4c328 | ||
|
|
e82fbf2c46 | ||
|
|
eb18da0450 | ||
|
|
0f7494216f | ||
|
|
442a278635 | ||
|
|
7ebed3ace6 | ||
|
|
8a7db1a918 | ||
|
|
ce80d794c0 | ||
|
|
c5d467a2ab | ||
|
|
a62cd553c2 | ||
|
|
ce5188335b | ||
|
|
b7a456ae83 | ||
|
|
d14f0b37d6 | ||
|
|
3ddc3c040f | ||
|
|
eeaa42ecaf | ||
|
|
3797a2a5cb | ||
|
|
0e2f8f7b47 | ||
|
|
3dd4b77dfb | ||
|
|
3b5cdea862 | ||
|
|
54b3e9eb10 | ||
|
|
56e8864426 | ||
|
|
b8f619d16e | ||
|
|
83dff33326 | ||
|
|
003c3695f4 | ||
|
|
10777c150d | ||
|
|
827179250b | ||
|
|
fd22cd520b | ||
|
|
241bc8f8fc | ||
|
|
b6e30124e0 | ||
|
|
4846d63539 | ||
|
|
13c47530a6 | ||
|
|
328f4369e6 | ||
|
|
04518e32e7 | ||
|
|
f2eaf5b977 | ||
|
|
3edeeca6e9 | ||
|
|
c41b353009 | ||
|
|
d18839f3dc | ||
|
|
8f978a3cf7 | ||
|
|
87b745454d | ||
|
|
e5bb750995 | ||
|
|
d599f0b3c7 | ||
|
|
40e310a9f9 | ||
|
|
510f15f4da | ||
|
|
f9ce7a84a8 | ||
|
|
1b6089674e | ||
|
|
05a0389555 | ||
|
|
16353bad6e | ||
|
|
fe442de091 | ||
|
|
1bb8b7170f | ||
|
|
55ed698a98 | ||
|
|
f1e025873f | ||
|
|
033ad2b8e4 | ||
|
|
098a8ee843 | ||
|
|
2f3dbf5fee | ||
|
|
5c80e4adb6 | ||
|
|
fcd1853031 | ||
|
|
74a89be641 | ||
|
|
5b1ce8bdc2 | ||
|
|
1529ce3341 | ||
|
|
95a4f15916 | ||
|
|
0eef05142a | ||
|
|
edd6eb4e06 | ||
|
|
7ac9782970 | ||
|
|
180054d7dc | ||
|
|
7eaaa04c69 | ||
|
|
87a627631c | ||
|
|
9df60e164e | ||
|
|
afa06c518a | ||
|
|
e46aeb34f8 | ||
|
|
d8f187e990 | ||
|
|
7d93d2ab05 | ||
|
|
9ca2d68e94 | ||
|
|
60b0e3ff80 | ||
|
|
3a1c081c13 | ||
|
|
d5c0b03030 | ||
|
|
eb184d7711 | ||
|
|
bb110c1ebe | ||
|
|
d9ca529356 | ||
|
|
0958837ee1 | ||
|
|
94be3bd4bb | ||
|
|
deb1de6e1e | ||
|
|
6afdf15d85 | ||
|
|
6cda7b24e9 | ||
|
|
ad9befbc1d | ||
|
|
e86d3d95c2 |
10
.travis.yml
10
.travis.yml
@@ -17,6 +17,8 @@ addons:
|
|||||||
# Needed for testing decompression search.
|
# Needed for testing decompression search.
|
||||||
- xz-utils
|
- xz-utils
|
||||||
- liblz4-tool
|
- liblz4-tool
|
||||||
|
# For building MUSL static builds on Linux.
|
||||||
|
- musl-tools
|
||||||
matrix:
|
matrix:
|
||||||
fast_finish: true
|
fast_finish: true
|
||||||
include:
|
include:
|
||||||
@@ -60,13 +62,13 @@ matrix:
|
|||||||
# Minimum Rust supported channel. We enable these to make sure ripgrep
|
# Minimum Rust supported channel. We enable these to make sure ripgrep
|
||||||
# continues to work on the advertised minimum Rust version.
|
# continues to work on the advertised minimum Rust version.
|
||||||
- os: linux
|
- os: linux
|
||||||
rust: 1.23.0
|
rust: 1.32.0
|
||||||
env: TARGET=x86_64-unknown-linux-gnu
|
env: TARGET=x86_64-unknown-linux-gnu
|
||||||
- os: linux
|
- os: linux
|
||||||
rust: 1.23.0
|
rust: 1.32.0
|
||||||
env: TARGET=x86_64-unknown-linux-musl
|
env: TARGET=x86_64-unknown-linux-musl
|
||||||
- os: linux
|
- os: linux
|
||||||
rust: 1.23.0
|
rust: 1.32.0
|
||||||
env: TARGET=arm-unknown-linux-gnueabihf GCC_VERSION=4.8
|
env: TARGET=arm-unknown-linux-gnueabihf GCC_VERSION=4.8
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
@@ -91,7 +93,7 @@ deploy:
|
|||||||
skip_cleanup: true
|
skip_cleanup: true
|
||||||
on:
|
on:
|
||||||
condition: $TRAVIS_RUST_VERSION = nightly
|
condition: $TRAVIS_RUST_VERSION = nightly
|
||||||
branch: master
|
branch: master # i guess we do need this after all?
|
||||||
tags: true
|
tags: true
|
||||||
api_key:
|
api_key:
|
||||||
secure: "IbSnsbGkxSydR/sozOf1/SRvHplzwRUHzcTjM7BKnr7GccL86gRPUrsrvD103KjQUGWIc1TnK1YTq5M0Onswg/ORDjqa1JEJPkPdPnVh9ipbF7M2De/7IlB4X4qXLKoApn8+bx2x/mfYXu4G+G1/2QdbaKK2yfXZKyjz0YFx+6CNrVCT2Nk8q7aHvOOzAL58vsG8iPDpupuhxlMDDn/UhyOWVInmPPQ0iJR1ZUJN8xJwXvKvBbfp3AhaBiAzkhXHNLgBR8QC5noWWMXnuVDMY3k4f3ic0V+p/qGUCN/nhptuceLxKFicMCYObSZeUzE5RAI0/OBW7l3z2iCoc+TbAnn+JrX/ObJCfzgAOXAU3tLaBFMiqQPGFKjKg1ltSYXomOFP/F7zALjpvFp4lYTBajRR+O3dqaxA9UQuRjw27vOeUpMcga4ZzL4VXFHzrxZKBHN//XIGjYAVhJ1NSSeGpeJV5/+jYzzWKfwSagRxQyVCzMooYFFXzn8Yxdm3PJlmp3GaAogNkdB9qKcrEvRINCelalzALPi0hD/HUDi8DD2PNTCLLMo6VSYtvc685Zbe+KgNzDV1YyTrRCUW6JotrS0r2ULLwnsh40hSB//nNv3XmwNmC/CmW5QAnIGj8cBMF4S2t6ohADIndojdAfNiptmaZOIT6owK7bWMgPMyopo="
|
secure: "IbSnsbGkxSydR/sozOf1/SRvHplzwRUHzcTjM7BKnr7GccL86gRPUrsrvD103KjQUGWIc1TnK1YTq5M0Onswg/ORDjqa1JEJPkPdPnVh9ipbF7M2De/7IlB4X4qXLKoApn8+bx2x/mfYXu4G+G1/2QdbaKK2yfXZKyjz0YFx+6CNrVCT2Nk8q7aHvOOzAL58vsG8iPDpupuhxlMDDn/UhyOWVInmPPQ0iJR1ZUJN8xJwXvKvBbfp3AhaBiAzkhXHNLgBR8QC5noWWMXnuVDMY3k4f3ic0V+p/qGUCN/nhptuceLxKFicMCYObSZeUzE5RAI0/OBW7l3z2iCoc+TbAnn+JrX/ObJCfzgAOXAU3tLaBFMiqQPGFKjKg1ltSYXomOFP/F7zALjpvFp4lYTBajRR+O3dqaxA9UQuRjw27vOeUpMcga4ZzL4VXFHzrxZKBHN//XIGjYAVhJ1NSSeGpeJV5/+jYzzWKfwSagRxQyVCzMooYFFXzn8Yxdm3PJlmp3GaAogNkdB9qKcrEvRINCelalzALPi0hD/HUDi8DD2PNTCLLMo6VSYtvc685Zbe+KgNzDV1YyTrRCUW6JotrS0r2ULLwnsh40hSB//nNv3XmwNmC/CmW5QAnIGj8cBMF4S2t6ohADIndojdAfNiptmaZOIT6owK7bWMgPMyopo="
|
||||||
|
|||||||
149
CHANGELOG.md
149
CHANGELOG.md
@@ -1,3 +1,150 @@
|
|||||||
|
0.11.0 (TBD)
|
||||||
|
============
|
||||||
|
TODO.
|
||||||
|
|
||||||
|
**BREAKING CHANGES**:
|
||||||
|
|
||||||
|
* ripgrep has tweaked its exit status codes to be more like GNU grep's. Namely,
|
||||||
|
if a non-fatal error occurs during a search, then ripgrep will now always
|
||||||
|
emit a `2` exit status code, regardless of whether a match is found or not.
|
||||||
|
Previously, ripgrep would only emit a `2` exit status code for a catastrophic
|
||||||
|
error (e.g., regex syntax error). One exception to this is if ripgrep is run
|
||||||
|
with `-q/--quiet`. In that case, if an error occurs and a match is found,
|
||||||
|
then ripgrep will exit with a `0` exit status code.
|
||||||
|
* The `avx-accel` feature of ripgrep has been removed since it is no longer
|
||||||
|
necessary. All uses of AVX in ripgrep are now enabled automatically via
|
||||||
|
runtime CPU feature detection. The `simd-accel` feature does remain
|
||||||
|
available, however, it does increase compilation times substantially at the
|
||||||
|
moment.
|
||||||
|
|
||||||
|
Feature enhancements:
|
||||||
|
|
||||||
|
* [FEATURE #1099](https://github.com/BurntSushi/ripgrep/pull/1099):
|
||||||
|
Add support for Brotli and Zstd to the `-z/--search-zip` flag.
|
||||||
|
* [FEATURE #1138](https://github.com/BurntSushi/ripgrep/pull/1138):
|
||||||
|
Add `--no-ignore-dot` flag for ignoring `.ignore` files.
|
||||||
|
* [FEATURE #1159](https://github.com/BurntSushi/ripgrep/pull/1159):
|
||||||
|
ripgrep's exit status logic should now match GNU grep. See updated man page.
|
||||||
|
* [FEATURE #1170](https://github.com/BurntSushi/ripgrep/pull/1170):
|
||||||
|
Add `--ignore-file-case-insensitive` for case insensitive .ignore globs.
|
||||||
|
|
||||||
|
Bug fixes:
|
||||||
|
|
||||||
|
* [BUG #373](https://github.com/BurntSushi/ripgrep/issues/373),
|
||||||
|
[BUG #1098](https://github.com/BurntSushi/ripgrep/issues/1098):
|
||||||
|
`**` is now accepted as valid syntax anywhere in a glob.
|
||||||
|
* [BUG #916](https://github.com/BurntSushi/ripgrep/issues/916):
|
||||||
|
ripgrep no longer hangs when searching `/proc` with a zombie process present.
|
||||||
|
* [BUG #1091](https://github.com/BurntSushi/ripgrep/issues/1091):
|
||||||
|
Add note about inverted flags to the man page.
|
||||||
|
* [BUG #1095](https://github.com/BurntSushi/ripgrep/issues/1095):
|
||||||
|
Fix corner cases involving the `--crlf` flag.
|
||||||
|
* [BUG #1103](https://github.com/BurntSushi/ripgrep/issues/1103):
|
||||||
|
Clarify what `--encoding auto` does.
|
||||||
|
* [BUG #1106](https://github.com/BurntSushi/ripgrep/issues/1106):
|
||||||
|
`--files-with-matches` and `--files-without-match` work with one file.
|
||||||
|
* [BUG #1093](https://github.com/BurntSushi/ripgrep/pull/1093):
|
||||||
|
Fix handling of literal slashes in gitignore patterns.
|
||||||
|
* [BUG #1121](https://github.com/BurntSushi/ripgrep/issues/1121):
|
||||||
|
Fix bug that was triggering Windows antimalware when using the --files flag.
|
||||||
|
* [BUG #1125](https://github.com/BurntSushi/ripgrep/issues/1125),
|
||||||
|
[BUG #1159](https://github.com/BurntSushi/ripgrep/issues/1159):
|
||||||
|
ripgrep shouldn't panic for `rg -h | rg` and should emit correct exit status.
|
||||||
|
* [BUG #1154](https://github.com/BurntSushi/ripgrep/issues/1154):
|
||||||
|
Windows files with "hidden" attribute are now treated as hidden.
|
||||||
|
* [BUG #1173](https://github.com/BurntSushi/ripgrep/issues/1173):
|
||||||
|
Fix handling of `**` patterns in gitignore files.
|
||||||
|
* [BUG #1174](https://github.com/BurntSushi/ripgrep/issues/1174):
|
||||||
|
Fix handling of repeated `**` patterns in gitignore files.
|
||||||
|
* [BUG #1176](https://github.com/BurntSushi/ripgrep/issues/1176):
|
||||||
|
Fix bug where `-F`/`-x` weren't applied to patterns given via `-f`.
|
||||||
|
|
||||||
|
|
||||||
|
0.10.0 (2018-09-07)
|
||||||
|
===================
|
||||||
|
This is a new minor version release of ripgrep that contains some major new
|
||||||
|
features, a huge number of bug fixes, and is the first release based on
|
||||||
|
libripgrep. The entirety of ripgrep's core search and printing code has been
|
||||||
|
rewritten and generalized so that anyone can make use of it.
|
||||||
|
|
||||||
|
Major new features include PCRE2 support, multi-line search and a JSON output
|
||||||
|
format.
|
||||||
|
|
||||||
|
**BREAKING CHANGES**:
|
||||||
|
|
||||||
|
* The minimum version required to compile Rust has now changed to track the
|
||||||
|
latest stable version of Rust. Patch releases will continue to compile with
|
||||||
|
the same version of Rust as the previous patch release, but new minor
|
||||||
|
versions will use the current stable version of the Rust compile as its
|
||||||
|
minimum supported version.
|
||||||
|
* The match semantics of `-w/--word-regexp` have changed slightly. They used
|
||||||
|
to be `\b(?:<your pattern>)\b`, but now it's
|
||||||
|
`(?:^|\W)(?:<your pattern>)(?:$|\W)`. This matches the behavior of GNU grep
|
||||||
|
and is believed to be closer to the intended semantics of the flag. See
|
||||||
|
[#389](https://github.com/BurntSushi/ripgrep/issues/389) for more details.
|
||||||
|
|
||||||
|
Feature enhancements:
|
||||||
|
|
||||||
|
* [FEATURE #162](https://github.com/BurntSushi/ripgrep/issues/162):
|
||||||
|
libripgrep is now a thing. The primary crate is
|
||||||
|
[`grep`](https://docs.rs/grep).
|
||||||
|
* [FEATURE #176](https://github.com/BurntSushi/ripgrep/issues/176):
|
||||||
|
Add `-U/--multiline` flag that permits matching over multiple lines.
|
||||||
|
* [FEATURE #188](https://github.com/BurntSushi/ripgrep/issues/188):
|
||||||
|
Add `-P/--pcre2` flag that gives support for look-around and backreferences.
|
||||||
|
* [FEATURE #244](https://github.com/BurntSushi/ripgrep/issues/244):
|
||||||
|
Add `--json` flag that prints results in a JSON Lines format.
|
||||||
|
* [FEATURE #321](https://github.com/BurntSushi/ripgrep/issues/321):
|
||||||
|
Add `--one-file-system` flag to skip directories on different file systems.
|
||||||
|
* [FEATURE #404](https://github.com/BurntSushi/ripgrep/issues/404):
|
||||||
|
Add `--sort` and `--sortr` flag for more sorting. Deprecate `--sort-files`.
|
||||||
|
* [FEATURE #416](https://github.com/BurntSushi/ripgrep/issues/416):
|
||||||
|
Add `--crlf` flag to permit `$` to work with carriage returns on Windows.
|
||||||
|
* [FEATURE #917](https://github.com/BurntSushi/ripgrep/issues/917):
|
||||||
|
The `--trim` flag strips prefix whitespace from all lines printed.
|
||||||
|
* [FEATURE #993](https://github.com/BurntSushi/ripgrep/issues/993):
|
||||||
|
Add `--null-data` flag, which makes ripgrep use NUL as a line terminator.
|
||||||
|
* [FEATURE #997](https://github.com/BurntSushi/ripgrep/issues/997):
|
||||||
|
The `--passthru` flag now works with the `--replace` flag.
|
||||||
|
* [FEATURE #1038-1](https://github.com/BurntSushi/ripgrep/issues/1038):
|
||||||
|
Add `--line-buffered` and `--block-buffered` for forcing a buffer strategy.
|
||||||
|
* [FEATURE #1038-2](https://github.com/BurntSushi/ripgrep/issues/1038):
|
||||||
|
Add `--pre-glob` for filtering files through the `--pre` flag.
|
||||||
|
|
||||||
|
Bug fixes:
|
||||||
|
|
||||||
|
* [BUG #2](https://github.com/BurntSushi/ripgrep/issues/2):
|
||||||
|
Searching with non-zero context can now use memory maps if appropriate.
|
||||||
|
* [BUG #200](https://github.com/BurntSushi/ripgrep/issues/200):
|
||||||
|
ripgrep will now stop correctly when its output pipe is closed.
|
||||||
|
* [BUG #389](https://github.com/BurntSushi/ripgrep/issues/389):
|
||||||
|
The `-w/--word-regexp` flag now works more intuitively.
|
||||||
|
* [BUG #643](https://github.com/BurntSushi/ripgrep/issues/643):
|
||||||
|
Detection of readable stdin has improved on Windows.
|
||||||
|
* [BUG #441](https://github.com/BurntSushi/ripgrep/issues/441),
|
||||||
|
[BUG #690](https://github.com/BurntSushi/ripgrep/issues/690),
|
||||||
|
[BUG #980](https://github.com/BurntSushi/ripgrep/issues/980):
|
||||||
|
Matching empty lines now works correctly in several corner cases.
|
||||||
|
* [BUG #764](https://github.com/BurntSushi/ripgrep/issues/764):
|
||||||
|
Color escape sequences now coalesce, which reduces output size.
|
||||||
|
* [BUG #842](https://github.com/BurntSushi/ripgrep/issues/842):
|
||||||
|
Add man page to binary Debian package.
|
||||||
|
* [BUG #922](https://github.com/BurntSushi/ripgrep/issues/922):
|
||||||
|
ripgrep is now more robust with respect to memory maps failing.
|
||||||
|
* [BUG #937](https://github.com/BurntSushi/ripgrep/issues/937):
|
||||||
|
Color escape sequences are no longer emitted for empty matches.
|
||||||
|
* [BUG #940](https://github.com/BurntSushi/ripgrep/issues/940):
|
||||||
|
Context from the `--passthru` flag should not impact process exit status.
|
||||||
|
* [BUG #984](https://github.com/BurntSushi/ripgrep/issues/984):
|
||||||
|
Fixes bug in `ignore` crate where first path was always treated as a symlink.
|
||||||
|
* [BUG #990](https://github.com/BurntSushi/ripgrep/issues/990):
|
||||||
|
Read stderr asynchronously when running a process.
|
||||||
|
* [BUG #1013](https://github.com/BurntSushi/ripgrep/issues/1013):
|
||||||
|
Add compile time and runtime CPU features to `--version` output.
|
||||||
|
* [BUG #1028](https://github.com/BurntSushi/ripgrep/pull/1028):
|
||||||
|
Don't complete bare pattern after `-f` in zsh.
|
||||||
|
|
||||||
|
|
||||||
0.9.0 (2018-08-03)
|
0.9.0 (2018-08-03)
|
||||||
==================
|
==================
|
||||||
This is a new minor version release of ripgrep that contains some minor new
|
This is a new minor version release of ripgrep that contains some minor new
|
||||||
@@ -31,7 +178,7 @@ multi-line search support and a JSON output format.
|
|||||||
|
|
||||||
Feature enhancements:
|
Feature enhancements:
|
||||||
|
|
||||||
* Added or improved file type filtering for Android, Bazel, Fuschia, Haskell,
|
* Added or improved file type filtering for Android, Bazel, Fuchsia, Haskell,
|
||||||
Java and Puppet.
|
Java and Puppet.
|
||||||
* [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411):
|
* [FEATURE #411](https://github.com/BurntSushi/ripgrep/issues/411):
|
||||||
Add a `--stats` flag, which emits aggregate statistics after search results.
|
Add a `--stats` flag, which emits aggregate statistics after search results.
|
||||||
|
|||||||
669
Cargo.lock
generated
669
Cargo.lock
generated
@@ -1,17 +1,11 @@
|
|||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aho-corasick"
|
name = "aho-corasick"
|
||||||
version = "0.6.6"
|
version = "0.6.9"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ansi_term"
|
|
||||||
version = "0.11.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
dependencies = [
|
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -19,27 +13,47 @@ name = "atty"
|
|||||||
version = "0.2.11"
|
version = "0.2.11"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "autocfg"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "base64"
|
||||||
|
version = "0.10.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bitflags"
|
name = "bitflags"
|
||||||
version = "1.0.3"
|
version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "bytecount"
|
name = "bytecount"
|
||||||
version = "0.3.1"
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "byteorder"
|
||||||
|
version = "1.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cc"
|
||||||
|
version = "1.0.29"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
|
||||||
"simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "0.1.4"
|
version = "0.1.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -47,34 +61,53 @@ name = "clap"
|
|||||||
version = "2.32.0"
|
version = "2.32.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "crossbeam"
|
name = "cloudabi"
|
||||||
version = "0.3.2"
|
version = "0.0.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crossbeam-channel"
|
||||||
|
version = "0.3.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "crossbeam-utils"
|
||||||
|
version = "0.6.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "encoding_rs"
|
name = "encoding_rs"
|
||||||
version = "0.8.4"
|
version = "0.8.16"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "encoding_rs_io"
|
name = "encoding_rs_io"
|
||||||
version = "0.1.1"
|
version = "0.1.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"encoding_rs 0.8.16 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -83,17 +116,8 @@ version = "1.0.6"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fuchsia-zircon"
|
name = "fuchsia-cprng"
|
||||||
version = "0.3.3"
|
version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
dependencies = [
|
|
||||||
"bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "fuchsia-zircon-sys"
|
|
||||||
version = "0.3.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -103,99 +127,324 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "globset"
|
name = "globset"
|
||||||
version = "0.4.1"
|
version = "0.4.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "grep"
|
name = "grep"
|
||||||
version = "0.1.9"
|
version = "0.2.3"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"grep-cli 0.1.1",
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"grep-matcher 0.1.1",
|
||||||
"regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"grep-pcre2 0.1.2",
|
||||||
"regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"grep-printer 0.1.1",
|
||||||
|
"grep-regex 0.1.2",
|
||||||
|
"grep-searcher 0.1.3",
|
||||||
|
"termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-cli"
|
||||||
|
version = "0.1.1"
|
||||||
|
dependencies = [
|
||||||
|
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"globset 0.4.2",
|
||||||
|
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-matcher"
|
||||||
|
version = "0.1.1"
|
||||||
|
dependencies = [
|
||||||
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-pcre2"
|
||||||
|
version = "0.1.2"
|
||||||
|
dependencies = [
|
||||||
|
"grep-matcher 0.1.1",
|
||||||
|
"pcre2 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-printer"
|
||||||
|
version = "0.1.1"
|
||||||
|
dependencies = [
|
||||||
|
"base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"grep-matcher 0.1.1",
|
||||||
|
"grep-regex 0.1.2",
|
||||||
|
"grep-searcher 0.1.3",
|
||||||
|
"serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"serde_derive 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-regex"
|
||||||
|
version = "0.1.2"
|
||||||
|
dependencies = [
|
||||||
|
"grep-matcher 0.1.1",
|
||||||
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "grep-searcher"
|
||||||
|
version = "0.1.3"
|
||||||
|
dependencies = [
|
||||||
|
"bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"encoding_rs 0.8.16 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"encoding_rs_io 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"grep-matcher 0.1.1",
|
||||||
|
"grep-regex 0.1.2",
|
||||||
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ignore"
|
name = "ignore"
|
||||||
version = "0.4.3"
|
version = "0.4.6"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"globset 0.4.1",
|
"globset 0.4.2",
|
||||||
"lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
"tempfile 3.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"walkdir 2.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "0.4.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lazy_static"
|
name = "lazy_static"
|
||||||
version = "1.0.2"
|
version = "1.2.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.42"
|
version = "0.2.48"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "log"
|
name = "log"
|
||||||
version = "0.4.3"
|
version = "0.4.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.0.1"
|
version = "2.1.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memmap"
|
name = "memmap"
|
||||||
version = "0.6.2"
|
version = "0.7.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num_cpus"
|
name = "num_cpus"
|
||||||
version = "1.8.0"
|
version = "1.9.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "packed_simd"
|
||||||
|
version = "0.3.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pcre2"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pcre2-sys 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pcre2-sys"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pkg-config"
|
||||||
|
version = "0.3.14"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "0.4.27"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "0.6.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rand"
|
name = "rand"
|
||||||
version = "0.4.2"
|
version = "0.6.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_chacha"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_core"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_core"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_hc"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_isaac"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_jitter"
|
||||||
|
version = "0.1.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_os"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_pcg"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_xorshift"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rdrand"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "redox_syscall"
|
name = "redox_syscall"
|
||||||
version = "0.1.40"
|
version = "0.1.51"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -203,27 +452,27 @@ name = "redox_termios"
|
|||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)",
|
"redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex"
|
name = "regex"
|
||||||
version = "1.0.2"
|
version = "1.1.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-syntax"
|
name = "regex-syntax"
|
||||||
version = "0.6.2"
|
version = "0.6.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -231,45 +480,92 @@ name = "remove_dir_all"
|
|||||||
version = "0.5.1"
|
version = "0.5.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ripgrep"
|
name = "ripgrep"
|
||||||
version = "0.9.0"
|
version = "0.10.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"grep 0.2.3",
|
||||||
"encoding_rs_io 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"ignore 0.4.6",
|
||||||
"globset 0.4.1",
|
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"grep 0.1.9",
|
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"ignore 0.4.3",
|
"num_cpus 1.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_derive 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"termcolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustc_version"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ryu"
|
||||||
|
version = "0.2.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "same-file"
|
name = "same-file"
|
||||||
version = "1.0.2"
|
version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "simd"
|
name = "semver"
|
||||||
version = "0.2.2"
|
version = "0.9.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "semver-parser"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.87"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.87"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_json"
|
||||||
|
version = "1.0.38"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "smallvec"
|
||||||
|
version = "0.6.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
@@ -277,20 +573,34 @@ version = "0.7.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tempdir"
|
name = "syn"
|
||||||
version = "0.3.7"
|
version = "0.15.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termcolor"
|
name = "termcolor"
|
||||||
version = "1.0.1"
|
version = "1.0.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"wincolor 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -298,8 +608,8 @@ name = "termion"
|
|||||||
version = "1.5.1"
|
version = "1.5.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)",
|
"redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -313,16 +623,15 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thread_local"
|
name = "thread_local"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ucd-util"
|
name = "ucd-util"
|
||||||
version = "0.1.1"
|
version = "0.1.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -330,6 +639,11 @@ name = "unicode-width"
|
|||||||
version = "0.1.5"
|
version = "0.1.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-xid"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unreachable"
|
name = "unreachable"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
@@ -340,7 +654,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "utf8-ranges"
|
name = "utf8-ranges"
|
||||||
version = "1.0.0"
|
version = "1.0.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -350,16 +664,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "walkdir"
|
name = "walkdir"
|
||||||
version = "2.1.4"
|
version = "2.2.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.3.5"
|
version = "0.3.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@@ -371,6 +686,14 @@ name = "winapi-i686-pc-windows-gnu"
|
|||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi-x86_64-pc-windows-gnu"
|
name = "winapi-x86_64-pc-windows-gnu"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
@@ -378,54 +701,86 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wincolor"
|
name = "wincolor"
|
||||||
version = "1.0.0"
|
version = "1.0.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
"winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[metadata]
|
[metadata]
|
||||||
"checksum aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c1c6d463cbe7ed28720b5b489e7c083eeb8f90d08be2a0d6bb9e1ffea9ce1afa"
|
"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e"
|
||||||
"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
|
|
||||||
"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652"
|
"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652"
|
||||||
"checksum bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d0c54bb8f454c567f21197eefcdbf5679d0bd99f2ddbe52e84c77061952e6789"
|
"checksum autocfg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a6d640bee2da49f60a4068a7fae53acde8982514ab7bae8b8cea9e88cbcfd799"
|
||||||
"checksum bytecount 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "882585cd7ec84e902472df34a5e01891202db3bf62614e1f0afe459c1afcf744"
|
"checksum base64 0.10.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0b25d992356d2eb0ed82172f5248873db5560c4721f564b13cb5193bda5e668e"
|
||||||
"checksum cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "efe5c877e17a9c717a0bf3613b2709f723202c4e4675cc8f12926ded29bcb17e"
|
"checksum bitflags 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "228047a76f468627ca71776ecdebd732a3423081fcf5125585bcd7c49886ce12"
|
||||||
|
"checksum bytecount 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "be0fdd54b507df8f22012890aadd099979befdba27713c767993f8380112ca7c"
|
||||||
|
"checksum byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a019b10a2a7cdeb292db131fc8113e57ea2a908f6e7894b0c3c671893b65dbeb"
|
||||||
|
"checksum cc 1.0.29 (registry+https://github.com/rust-lang/crates.io-index)" = "4390a3b5f4f6bce9c1d0c00128379df433e53777fdd30e92f16a529332baec4e"
|
||||||
|
"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4"
|
||||||
"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e"
|
"checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e"
|
||||||
"checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
|
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||||
"checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac"
|
"checksum crossbeam-channel 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "0f0ed1a4de2235cabda8558ff5840bffb97fcb64c97827f354a451307df5f72b"
|
||||||
"checksum encoding_rs_io 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad0ffe753ba194ef1bc070e8d61edaadb1536c05e364fc9178ca6cbde10922c4"
|
"checksum crossbeam-utils 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f8306fcef4a7b563b76b7dd949ca48f52bc1141aa067d2ea09565f3e2652aa5c"
|
||||||
|
"checksum encoding_rs 0.8.16 (registry+https://github.com/rust-lang/crates.io-index)" = "0535f350c60aac0b87ccf28319abc749391e912192255b0c00a2c12c6917bd73"
|
||||||
|
"checksum encoding_rs_io 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "6c89a56158243c7cde22fde70e452a40dded9d9d9100f71273df19af9be4d034"
|
||||||
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
||||||
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
"checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
|
||||||
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
|
||||||
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
||||||
"checksum lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fb497c35d362b6a331cfd94956a07fc2c78a4604cdbee844a81170386b996dd3"
|
"checksum itoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "1306f3464951f30e30d12373d31c79fbd52d236e5e896fd92f96ec7babbbe60b"
|
||||||
"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1"
|
"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
|
||||||
"checksum log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "61bd98ae7f7b754bc53dca7d44b604f733c6bba044ea6f41bc8d89272d8161d2"
|
"checksum libc 0.2.48 (registry+https://github.com/rust-lang/crates.io-index)" = "e962c7641008ac010fa60a7dfdc1712449f29c44ef2d4702394aea943ee75047"
|
||||||
"checksum memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d"
|
"checksum log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c84ec4b527950aa83a329754b01dbe3f58361d1c5efacd1f6d68c494d08a17c6"
|
||||||
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
|
"checksum memchr 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e1dd4eaac298c32ce07eb6ed9242eda7d82955b9170b7d6db59b2e02cc63fcb8"
|
||||||
"checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30"
|
"checksum memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
||||||
"checksum rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "eba5f8cb59cc50ed56be8880a5c7b496bfd9bd26394e176bc67884094145c2c5"
|
"checksum num_cpus 1.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5a69d464bdc213aaaff628444e99578ede64e9c854025aa43b9796530afa9238"
|
||||||
"checksum redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "c214e91d3ecf43e9a4e41e578973adeb14b474f2bee858742d127af75a0112b1"
|
"checksum packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a85ea9fc0d4ac0deb6fe7911d38786b32fc11119afd9e9d38b84ff691ce64220"
|
||||||
|
"checksum pcre2 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3ae0a2682105ec5ca0ee5910bbc7e926386d348a05166348f74007942983c319"
|
||||||
|
"checksum pcre2-sys 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a9027f9474e4e13d3b965538aafcaebe48c803488ad76b3c97ef061a8324695f"
|
||||||
|
"checksum pkg-config 0.3.14 (registry+https://github.com/rust-lang/crates.io-index)" = "676e8eb2b1b4c9043511a9b7bea0915320d7e502b0a079fb03f9635a5252b18c"
|
||||||
|
"checksum proc-macro2 0.4.27 (registry+https://github.com/rust-lang/crates.io-index)" = "4d317f9caece796be1980837fd5cb3dfec5613ebdb04ad0956deea83ce168915"
|
||||||
|
"checksum quote 0.6.11 (registry+https://github.com/rust-lang/crates.io-index)" = "cdd8e04bd9c52e0342b406469d494fcb033be4bdbe5c606016defbb1681411e1"
|
||||||
|
"checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
|
||||||
|
"checksum rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
|
||||||
|
"checksum rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6fdeb83b075e8266dcc8762c22776f6877a63111121f5f8c7411e5be7eed4b"
|
||||||
|
"checksum rand_core 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d0e7a549d590831370895ab7ba4ea0c1b6b011d106b5ff2da6eee112615e6dc0"
|
||||||
|
"checksum rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4"
|
||||||
|
"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
|
||||||
|
"checksum rand_jitter 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b9ea758282efe12823e0d952ddb269d2e1897227e464919a554f2a03ef1b832"
|
||||||
|
"checksum rand_os 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "b7c690732391ae0abafced5015ffb53656abfaec61b342290e5eb56b286a679d"
|
||||||
|
"checksum rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "086bd09a33c7044e56bb44d5bdde5a60e7f119a9e95b0775f545de759a32fe05"
|
||||||
|
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
|
||||||
|
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
|
||||||
|
"checksum redox_syscall 0.1.51 (registry+https://github.com/rust-lang/crates.io-index)" = "423e376fffca3dfa06c9e9790a9ccd282fafb3cc6e6397d01dbf64f9bacc6b85"
|
||||||
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
"checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76"
|
||||||
"checksum regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5bbbea44c5490a1e84357ff28b7d518b4619a159fed5d25f6c1de2d19cc42814"
|
"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f"
|
||||||
"checksum regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "747ba3b235651f6e2f67dfa8bcdcd073ddb7c243cb21c442fc12395dfcac212d"
|
"checksum regex-syntax 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8c2f35eedad5295fdf00a63d7d4b238135723f92b434ec06774dad15c7ab0861"
|
||||||
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
|
"checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5"
|
||||||
"checksum same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cfb6eded0b06a0b512c8ddbcf04089138c9b4362c2f696f3c3d76039d68f3637"
|
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
|
||||||
"checksum simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "ed3686dd9418ebcc3a26a0c0ae56deab0681e53fe899af91f5bbcee667ebffb1"
|
"checksum ryu 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "eb9e9b8cde282a9fe6a42dd4681319bfb63f121b8a8ee9439c6f4107e58a46f7"
|
||||||
|
"checksum same-file 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8f20c4be53a8a1ff4c1f1b2bd14570d2f634628709752f0702ecdd2b3f9a5267"
|
||||||
|
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||||
|
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||||
|
"checksum serde 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)" = "2e20fde37801e83c891a2dc4ebd3b81f0da4d1fb67a9e0a2a3b921e2536a58ee"
|
||||||
|
"checksum serde_derive 1.0.87 (registry+https://github.com/rust-lang/crates.io-index)" = "633e97856567e518b59ffb2ad7c7a4fd4c5d91d9c7f32dd38a27b2bf7e8114ea"
|
||||||
|
"checksum serde_json 1.0.38 (registry+https://github.com/rust-lang/crates.io-index)" = "27dce848e7467aa0e2fcaf0a413641499c0b745452aaca1194d24dedde9e13c9"
|
||||||
|
"checksum smallvec 0.6.8 (registry+https://github.com/rust-lang/crates.io-index)" = "88aea073965ab29f6edb5493faf96ad662fb18aa9eeb186a3b7057951605ed15"
|
||||||
"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"
|
"checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"
|
||||||
"checksum tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8"
|
"checksum syn 0.15.26 (registry+https://github.com/rust-lang/crates.io-index)" = "f92e629aa1d9c827b2bb8297046c1ccffc57c99b947a680d3ccff1f136a3bee9"
|
||||||
"checksum termcolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "722426c4a0539da2c4ffd9b419d90ad540b4cff4a053be9069c908d4d07e2836"
|
"checksum tempfile 3.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "37daa55a7240c4931c84559f03b3cad7d19535840d1c4a0cc4e9b2fb0dcf70ff"
|
||||||
|
"checksum termcolor 1.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4096add70612622289f2fdcdbd5086dc81c1e2675e6ae58d6c4f62a16c6d7f2f"
|
||||||
"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
|
"checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
|
||||||
"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6"
|
"checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6"
|
||||||
"checksum thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279ef31c19ededf577bfd12dfae728040a21f635b06a24cd670ff510edd38963"
|
"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
|
||||||
"checksum ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "fd2be2d6639d0f8fe6cdda291ad456e23629558d466e2789d2c3e9892bda285d"
|
"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86"
|
||||||
"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
|
"checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526"
|
||||||
|
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||||
"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"
|
"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56"
|
||||||
"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122"
|
"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737"
|
||||||
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
|
"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
|
||||||
"checksum walkdir 2.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "63636bd0eb3d00ccb8b9036381b526efac53caf112b7783b730ab3f8e44da369"
|
"checksum walkdir 2.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "9d9d7ed3431229a144296213105a390676cc49c9b6a72bd19f3176c98e129fa1"
|
||||||
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
|
"checksum winapi 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "92c1eb33641e276cfa214a0522acad57be5c56b10cb348b3c5117db75f3ac4b0"
|
||||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
|
"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9"
|
||||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
"checksum wincolor 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b9dc3aa9dcda98b5a16150c54619c1ead22e3d3a5d458778ae914be760aa981a"
|
"checksum wincolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "561ed901ae465d6185fa7864d63fbd5720d0ef718366c9a4dc83cf6170d7e9ba"
|
||||||
|
|||||||
95
Cargo.toml
95
Cargo.toml
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "ripgrep"
|
name = "ripgrep"
|
||||||
version = "0.9.0" #:version
|
version = "0.10.0" #:version
|
||||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
description = """
|
description = """
|
||||||
ripgrep is a line-oriented search tool that recursively searches your current
|
ripgrep is a line-oriented search tool that recursively searches your current
|
||||||
@@ -17,6 +17,7 @@ license = "Unlicense OR MIT"
|
|||||||
exclude = ["HomebrewFormula"]
|
exclude = ["HomebrewFormula"]
|
||||||
build = "build.rs"
|
build = "build.rs"
|
||||||
autotests = false
|
autotests = false
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
[badges]
|
[badges]
|
||||||
travis-ci = { repository = "BurntSushi/ripgrep" }
|
travis-ci = { repository = "BurntSushi/ripgrep" }
|
||||||
@@ -32,51 +33,75 @@ name = "integration"
|
|||||||
path = "tests/tests.rs"
|
path = "tests/tests.rs"
|
||||||
|
|
||||||
[workspace]
|
[workspace]
|
||||||
members = ["grep", "globset", "ignore"]
|
members = [
|
||||||
|
"globset",
|
||||||
|
"grep",
|
||||||
|
"grep-cli",
|
||||||
|
"grep-matcher",
|
||||||
|
"grep-pcre2",
|
||||||
|
"grep-printer",
|
||||||
|
"grep-regex",
|
||||||
|
"grep-searcher",
|
||||||
|
"ignore",
|
||||||
|
]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
atty = "0.2.11"
|
grep = { version = "0.2.3", path = "grep" }
|
||||||
bytecount = "0.3.1"
|
ignore = { version = "0.4.4", path = "ignore" }
|
||||||
encoding_rs = "0.8"
|
lazy_static = "1.1.0"
|
||||||
encoding_rs_io = "0.1"
|
log = "0.4.5"
|
||||||
globset = { version = "0.4.0", path = "globset" }
|
num_cpus = "1.8.0"
|
||||||
grep = { version = "0.1.8", path = "grep" }
|
regex = "1.0.5"
|
||||||
ignore = { version = "0.4.0", path = "ignore" }
|
serde_json = "1.0.23"
|
||||||
lazy_static = "1"
|
termcolor = "1.0.3"
|
||||||
libc = "0.2"
|
|
||||||
log = "0.4"
|
|
||||||
memchr = "2"
|
|
||||||
memmap = "0.6"
|
|
||||||
num_cpus = "1"
|
|
||||||
regex = "1"
|
|
||||||
same-file = "1"
|
|
||||||
termcolor = "1"
|
|
||||||
|
|
||||||
[dependencies.clap]
|
[dependencies.clap]
|
||||||
version = "2.29.4"
|
version = "2.32.0"
|
||||||
default-features = false
|
default-features = false
|
||||||
features = ["suggestions", "color"]
|
features = ["suggestions"]
|
||||||
|
|
||||||
[target.'cfg(windows)'.dependencies.winapi]
|
|
||||||
version = "0.3"
|
|
||||||
features = ["std", "winnt"]
|
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
lazy_static = "1"
|
lazy_static = "1.1.0"
|
||||||
|
|
||||||
[build-dependencies.clap]
|
[build-dependencies.clap]
|
||||||
version = "2.29.4"
|
version = "2.32.0"
|
||||||
default-features = false
|
default-features = false
|
||||||
features = ["suggestions", "color"]
|
features = ["suggestions"]
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
serde = "1.0.77"
|
||||||
|
serde_derive = "1.0.77"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
avx-accel = [
|
simd-accel = ["grep/simd-accel"]
|
||||||
"bytecount/avx-accel",
|
pcre2 = ["grep/pcre2"]
|
||||||
]
|
|
||||||
simd-accel = [
|
|
||||||
"bytecount/simd-accel",
|
|
||||||
"encoding_rs/simd-accel",
|
|
||||||
]
|
|
||||||
|
|
||||||
[profile.release]
|
[profile.release]
|
||||||
debug = true
|
debug = 1
|
||||||
|
|
||||||
|
[package.metadata.deb]
|
||||||
|
features = ["pcre2"]
|
||||||
|
section = "utils"
|
||||||
|
assets = [
|
||||||
|
["target/release/rg", "usr/bin/", "755"],
|
||||||
|
["COPYING", "usr/share/doc/ripgrep/", "644"],
|
||||||
|
["LICENSE-MIT", "usr/share/doc/ripgrep/", "644"],
|
||||||
|
["UNLICENSE", "usr/share/doc/ripgrep/", "644"],
|
||||||
|
["CHANGELOG.md", "usr/share/doc/ripgrep/CHANGELOG", "644"],
|
||||||
|
["README.md", "usr/share/doc/ripgrep/README", "644"],
|
||||||
|
["FAQ.md", "usr/share/doc/ripgrep/FAQ", "644"],
|
||||||
|
# The man page is automatically generated by ripgrep's build process, so
|
||||||
|
# this file isn't actually commited. Instead, to create a dpkg, either
|
||||||
|
# create a deployment/deb directory and copy the man page to it, or use the
|
||||||
|
# 'ci/build_deb.sh' script.
|
||||||
|
["deployment/deb/rg.1", "usr/share/man/man1/rg.1", "644"],
|
||||||
|
# Similarly for shell completions.
|
||||||
|
["deployment/deb/rg.bash", "usr/share/bash-completion/completions/rg", "644"],
|
||||||
|
["deployment/deb/rg.fish", "usr/share/fish/completions/rg.fish", "644"],
|
||||||
|
["deployment/deb/_rg", "usr/share/zsh/vendor-completions/", "644"],
|
||||||
|
]
|
||||||
|
extended-description = """\
|
||||||
|
ripgrep (rg) recursively searches your current directory for a regex pattern.
|
||||||
|
By default, ripgrep will respect your .gitignore and automatically skip hidden
|
||||||
|
files/directories and binary files.
|
||||||
|
"""
|
||||||
|
|||||||
347
FAQ.md
347
FAQ.md
@@ -16,6 +16,7 @@
|
|||||||
* [How do I get around the regex size limit?](#size-limit)
|
* [How do I get around the regex size limit?](#size-limit)
|
||||||
* [How do I make the `-f/--file` flag faster?](#dfa-size)
|
* [How do I make the `-f/--file` flag faster?](#dfa-size)
|
||||||
* [How do I make the output look like The Silver Searcher's output?](#silver-searcher-output)
|
* [How do I make the output look like The Silver Searcher's output?](#silver-searcher-output)
|
||||||
|
* [Why does ripgrep get slower when I enabled PCRE2 regexes?](#pcre2-slow)
|
||||||
* [When I run `rg`, why does it execute some other command?](#rg-other-cmd)
|
* [When I run `rg`, why does it execute some other command?](#rg-other-cmd)
|
||||||
* [How do I create an alias for ripgrep on Windows?](#rg-alias-windows)
|
* [How do I create an alias for ripgrep on Windows?](#rg-alias-windows)
|
||||||
* [How do I create a PowerShell profile?](#powershell-profile)
|
* [How do I create a PowerShell profile?](#powershell-profile)
|
||||||
@@ -117,7 +118,7 @@ from run to run of ripgrep.
|
|||||||
The only way to make the order of results consistent is to ask ripgrep to
|
The only way to make the order of results consistent is to ask ripgrep to
|
||||||
sort the output. Currently, this will disable all parallelism. (On smaller
|
sort the output. Currently, this will disable all parallelism. (On smaller
|
||||||
repositories, you might not notice much of a performance difference!) You
|
repositories, you might not notice much of a performance difference!) You
|
||||||
can achieve this with the `--sort-files` flag.
|
can achieve this with the `--sort path` flag.
|
||||||
|
|
||||||
There is more discussion on this topic here:
|
There is more discussion on this topic here:
|
||||||
https://github.com/BurntSushi/ripgrep/issues/152
|
https://github.com/BurntSushi/ripgrep/issues/152
|
||||||
@@ -135,10 +136,10 @@ How do I search compressed files?
|
|||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
ripgrep's `-z/--search-zip` flag will cause it to search compressed files
|
ripgrep's `-z/--search-zip` flag will cause it to search compressed files
|
||||||
automatically. Currently, this supports gzip, bzip2, lzma, lz4 and xz only and
|
automatically. Currently, this supports gzip, bzip2, xz, lzma, lz4, Brotli and
|
||||||
requires the corresponding `gzip`, `bzip2` and `xz` binaries to be installed on
|
Zstd. Each of these requires requires the corresponding `gzip`, `bzip2`, `xz`,
|
||||||
your system. (That is, ripgrep does decompression by shelling out to another
|
`lz4`, `brotli` and `zstd` binaries to be installed on your system. (That is,
|
||||||
process.)
|
ripgrep does decompression by shelling out to another process.)
|
||||||
|
|
||||||
ripgrep currently does not search archive formats, so `*.tar.gz` files, for
|
ripgrep currently does not search archive formats, so `*.tar.gz` files, for
|
||||||
example, are skipped.
|
example, are skipped.
|
||||||
@@ -148,22 +149,45 @@ example, are skipped.
|
|||||||
How do I search over multiple lines?
|
How do I search over multiple lines?
|
||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
This isn't currently possible. ripgrep is fundamentally a line-oriented search
|
The `-U/--multiline` flag enables ripgrep to report results that span over
|
||||||
tool. With that said,
|
multiple lines.
|
||||||
[multiline search is a planned opt-in feature](https://github.com/BurntSushi/ripgrep/issues/176).
|
|
||||||
|
|
||||||
|
|
||||||
<h3 name="fancy">
|
<h3 name="fancy">
|
||||||
How do I use lookaround and/or backreferences?
|
How do I use lookaround and/or backreferences?
|
||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
This isn't currently possible. ripgrep uses finite automata to implement
|
ripgrep's default regex engine does not support lookaround or backreferences.
|
||||||
regular expression search, and in turn, guarantees linear time searching on all
|
This is primarily because the default regex engine is implemented using finite
|
||||||
inputs. It is difficult to efficiently support lookaround and backreferences in
|
state machines in order to guarantee a linear worst case time complexity on all
|
||||||
finite automata engines, so ripgrep does not provide these features.
|
inputs. Backreferences are not possible to implement in this paradigm, and
|
||||||
|
lookaround appears difficult to do efficiently.
|
||||||
|
|
||||||
If a production quality regular expression engine with these features is ever
|
However, ripgrep optionally supports using PCRE2 as the regex engine instead of
|
||||||
written in Rust, then it is possible ripgrep will provide it as an opt-in
|
the default one based on finite state machines. You can enable PCRE2 with the
|
||||||
|
`-P/--pcre2` flag. For example, in the root of the ripgrep repo, you can easily
|
||||||
|
find all palindromes:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg -P '(\w{10})\1'
|
||||||
|
tests/misc.rs
|
||||||
|
483: cmd.arg("--max-filesize").arg("44444444444444444444");
|
||||||
|
globset/src/glob.rs
|
||||||
|
1206: matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
|
||||||
|
```
|
||||||
|
|
||||||
|
If your version of ripgrep doesn't support PCRE2, then you'll get an error
|
||||||
|
message when you try to use the `-P/--pcre2` flag:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg -P '(\w{10})\1'
|
||||||
|
PCRE2 is not available in this build of ripgrep
|
||||||
|
```
|
||||||
|
|
||||||
|
Most of the releases distributed by the ripgrep project here on GitHub will
|
||||||
|
come bundled with PCRE2 enabled. If you installed ripgrep through a different
|
||||||
|
means (like your system's package manager), then please reach out to the
|
||||||
|
maintainer of that package to see whether it's possible to enable the PCRE2
|
||||||
feature.
|
feature.
|
||||||
|
|
||||||
|
|
||||||
@@ -368,6 +392,301 @@ $ RIPGREP_CONFIG_PATH=$HOME/.config/ripgrep/rc rg foo
|
|||||||
```
|
```
|
||||||
|
|
||||||
|
|
||||||
|
<h3 name="pcre2-slow">
|
||||||
|
Why does ripgrep get slower when I enable PCRE2 regexes?
|
||||||
|
</h3>
|
||||||
|
|
||||||
|
When you use the `--pcre2` (`-P` for short) flag, ripgrep will use the PCRE2
|
||||||
|
regex engine instead of the default. Both regex engines are quite fast,
|
||||||
|
but PCRE2 provides a number of additional features such as look-around and
|
||||||
|
backreferences that many enjoy using. This is largely because PCRE2 uses
|
||||||
|
a backtracking implementation whereas the default regex engine uses a finite
|
||||||
|
automaton based implementation. The former provides the ability to add lots of
|
||||||
|
bells and whistles over the latter, but the latter executes with worst case
|
||||||
|
linear time complexity.
|
||||||
|
|
||||||
|
With that out of the way, if you've used `-P` with ripgrep, you may have
|
||||||
|
noticed that it can be slower. The reasons for why this is are quite complex,
|
||||||
|
and they are complex because the optimizations that ripgrep uses to implement
|
||||||
|
fast search are complex.
|
||||||
|
|
||||||
|
The task ripgrep has before it is somewhat simple; all it needs to do is search
|
||||||
|
a file for occurrences of some pattern and then print the lines containing
|
||||||
|
those occurrences. The problem lies in what is considered a valid match and how
|
||||||
|
exactly we read the bytes from a file.
|
||||||
|
|
||||||
|
In terms of what is considered a valid match, remember that ripgrep will only
|
||||||
|
report matches spanning a single line by default. The problem here is that
|
||||||
|
some patterns can match across multiple lines, and ripgrep needs to prevent
|
||||||
|
that from happening. For example, `foo\sbar` will match `foo\nbar`. The most
|
||||||
|
obvious way to achieve this is to read the data from a file, and then apply
|
||||||
|
the pattern search to that data for each line. The problem with this approach
|
||||||
|
is that it can be quite slow; it would be much faster to let the pattern
|
||||||
|
search across as much data as possible. It's faster because it gets rid of the
|
||||||
|
overhead of finding the boundaries of every line, and also because it gets rid
|
||||||
|
of the overhead of starting and stopping the pattern search for every single
|
||||||
|
line. (This is operating under the general assumption that matching lines are
|
||||||
|
much rarer than non-matching lines.)
|
||||||
|
|
||||||
|
It turns out that we can use the faster approach by applying a very simple
|
||||||
|
restriction to the pattern: *statically prevent* the pattern from matching
|
||||||
|
through a `\n` character. Namely, when given a pattern like `foo\sbar`,
|
||||||
|
ripgrep will remove `\n` from the `\s` character class automatically. In some
|
||||||
|
cases, a simple removal is not so easy. For example, ripgrep will return an
|
||||||
|
error when your pattern includes a `\n` literal:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg '\n'
|
||||||
|
the literal '"\n"' is not allowed in a regex
|
||||||
|
```
|
||||||
|
|
||||||
|
So what does this have to do with PCRE2? Well, ripgrep's default regex engine
|
||||||
|
exposes APIs for doing syntactic analysis on the pattern in a way that makes
|
||||||
|
it quite easy to strip `\n` from the pattern (or otherwise detect it and report
|
||||||
|
an error if stripping isn't possible). PCRE2 seemingly does not provide a
|
||||||
|
similar API, so ripgrep does not do any stripping when PCRE2 is enabled. This
|
||||||
|
forces ripgrep to use the "slow" search strategy of searching each line
|
||||||
|
individually.
|
||||||
|
|
||||||
|
OK, so if enabling PCRE2 slows down the default method of searching because it
|
||||||
|
forces matches to be limited to a single line, then why is PCRE2 also sometimes
|
||||||
|
slower when performing multiline searches? Well, that's because there are
|
||||||
|
*multiple* reasons why using PCRE2 in ripgrep can be slower than the default
|
||||||
|
regex engine. This time, blame PCRE2's Unicode support, which ripgrep enables
|
||||||
|
by default. In particular, PCRE2 cannot simultaneously enable Unicode support
|
||||||
|
and search arbitrary data. That is, when PCRE2's Unicode support is enabled,
|
||||||
|
the data **must** be valid UTF-8 (to do otherwise is to invoke undefined
|
||||||
|
behavior). This is in contrast to ripgrep's default regex engine, which can
|
||||||
|
enable Unicode support and still search arbitrary data. ripgrep's default
|
||||||
|
regex engine simply won't match invalid UTF-8 for a pattern that can otherwise
|
||||||
|
only match valid UTF-8. Why doesn't PCRE2 do the same? This author isn't
|
||||||
|
familiar with its internals, so we can't comment on it here.
|
||||||
|
|
||||||
|
The bottom line here is that we can't enable PCRE2's Unicode support without
|
||||||
|
simultaneously incurring a performance penalty for ensuring that we are
|
||||||
|
searching valid UTF-8. In particular, ripgrep will transcode the contents
|
||||||
|
of each file to UTF-8 while replacing invalid UTF-8 data with the Unicode
|
||||||
|
replacement codepoint. ripgrep then disables PCRE2's own internal UTF-8
|
||||||
|
checking, since we've guaranteed the data we hand it will be valid UTF-8. The
|
||||||
|
reason why ripgrep takes this approach is because if we do hand PCRE2 invalid
|
||||||
|
UTF-8, then it will report a match error if it comes across an invalid UTF-8
|
||||||
|
sequence. This is not good news for ripgrep, since it will stop it from
|
||||||
|
searching the rest of the file, and will also print potentially undesirable
|
||||||
|
error messages to users.
|
||||||
|
|
||||||
|
All right, the above is a lot of information to swallow if you aren't already
|
||||||
|
familiar with ripgrep internals. Let's make this concrete with some examples.
|
||||||
|
First, let's get some data big enough to magnify the performance differences:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ curl -O 'https://burntsushi.net/stuff/subtitles2016-sample.gz'
|
||||||
|
$ gzip -d subtitles2016-sample.gz
|
||||||
|
$ md5sum subtitles2016-sample
|
||||||
|
e3cb796a20bbc602fbfd6bb43bda45f5 subtitles2016-sample
|
||||||
|
```
|
||||||
|
|
||||||
|
To search this data, we will use the pattern `^\w{42}$`, which has exactly
|
||||||
|
one hit in the file and has no literals. Having no literals is important,
|
||||||
|
because it ensures that the regex engine won't use literal optimizations to
|
||||||
|
speed up the search. In other words, it lets us reason coherently about the
|
||||||
|
actual task that the regex engine is performing.
|
||||||
|
|
||||||
|
Let's now walk through a few examples in light of the information above. First,
|
||||||
|
let's consider the default search using ripgrep's default regex engine and
|
||||||
|
then the same search with PCRE2:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ time rg '^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.783s
|
||||||
|
user 0m1.731s
|
||||||
|
sys 0m0.051s
|
||||||
|
|
||||||
|
$ time rg -P '^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m2.458s
|
||||||
|
user 0m2.419s
|
||||||
|
sys 0m0.038s
|
||||||
|
```
|
||||||
|
|
||||||
|
In this particular example, both pattern searches are using a Unicode aware
|
||||||
|
`\w` character class and both are counting lines in order to report line
|
||||||
|
numbers. The key difference here is that the first search will not search
|
||||||
|
line by line, but the second one will. We can observe which strategy ripgrep
|
||||||
|
uses by passing the `--trace` flag:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rg '^\w{42}$' subtitles2016-sample --trace
|
||||||
|
[... snip ...]
|
||||||
|
TRACE|grep_searcher::searcher|grep-searcher/src/searcher/mod.rs:622: Some("subtitles2016-sample"): searching via memory map
|
||||||
|
TRACE|grep_searcher::searcher|grep-searcher/src/searcher/mod.rs:712: slice reader: searching via slice-by-line strategy
|
||||||
|
TRACE|grep_searcher::searcher::core|grep-searcher/src/searcher/core.rs:61: searcher core: will use fast line searcher
|
||||||
|
[... snip ...]
|
||||||
|
|
||||||
|
$ rg -P '^\w{42}$' subtitles2016-sample --trace
|
||||||
|
[... snip ...]
|
||||||
|
TRACE|grep_searcher::searcher|grep-searcher/src/searcher/mod.rs:622: Some("subtitles2016-sample"): searching via memory map
|
||||||
|
TRACE|grep_searcher::searcher|grep-searcher/src/searcher/mod.rs:705: slice reader: needs transcoding, using generic reader
|
||||||
|
TRACE|grep_searcher::searcher|grep-searcher/src/searcher/mod.rs:685: generic reader: searching via roll buffer strategy
|
||||||
|
TRACE|grep_searcher::searcher::core|grep-searcher/src/searcher/core.rs:63: searcher core: will use slow line searcher
|
||||||
|
[... snip ...]
|
||||||
|
```
|
||||||
|
|
||||||
|
The first says it is using the "fast line searcher" whereas the latter says
|
||||||
|
it is using the "slow line searcher." The latter also shows that we are
|
||||||
|
decoding the contents of the file, which also impacts performance.
|
||||||
|
|
||||||
|
Interestingly, in this case, the pattern does not match a `\n` and the file
|
||||||
|
we're searching is valid UTF-8, so neither the slow line-by-line search
|
||||||
|
strategy nor the decoding are necessary. We could fix the former issue with
|
||||||
|
better PCRE2 introspection APIs. We can actually fix the latter issue with
|
||||||
|
ripgrep's `--no-encoding` flag, which prevents the automatic UTF-8 decoding,
|
||||||
|
but will enable PCRE2's own UTF-8 validity checking. Unfortunately, it's slower
|
||||||
|
in my build of ripgrep:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ time rg -P '^\w{42}$' subtitles2016-sample --no-encoding
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m3.074s
|
||||||
|
user 0m3.021s
|
||||||
|
sys 0m0.051s
|
||||||
|
```
|
||||||
|
|
||||||
|
(Tip: use the `--trace` flag to verify that no decoding in ripgrep is
|
||||||
|
happening.)
|
||||||
|
|
||||||
|
A possible reason why PCRE2's UTF-8 checking is slower is because it might
|
||||||
|
not be better than the highly optimized UTF-8 checking routines found in the
|
||||||
|
[`encoding_rs`](https://github.com/hsivonen/encoding_rs) library, which is what
|
||||||
|
ripgrep uses for UTF-8 decoding. Moreover, my build of ripgrep enables
|
||||||
|
`encoding_rs`'s SIMD optimizations, which may be in play here.
|
||||||
|
|
||||||
|
Also, note that using the `--no-encoding` flag can cause PCRE2 to report
|
||||||
|
invalid UTF-8 errors, which causes ripgrep to stop searching the file:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cat invalid-utf8
|
||||||
|
foobar
|
||||||
|
|
||||||
|
$ xxd invalid-utf8
|
||||||
|
00000000: 666f 6fff 6261 720a foo.bar.
|
||||||
|
|
||||||
|
$ rg foo invalid-utf8
|
||||||
|
1:foobar
|
||||||
|
|
||||||
|
$ rg -P foo invalid-utf8
|
||||||
|
1:foo<6F>bar
|
||||||
|
|
||||||
|
$ rg -P foo invalid-utf8 --no-encoding
|
||||||
|
invalid-utf8: PCRE2: error matching: UTF-8 error: illegal byte (0xfe or 0xff)
|
||||||
|
```
|
||||||
|
|
||||||
|
All right, so at this point, you might think that we could remove the penalty
|
||||||
|
for line-by-line searching by enabling multiline search. After all, our
|
||||||
|
particular pattern can't match across multiple lines anyway, so we'll still get
|
||||||
|
the results we want. Let's try it:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ time rg -U '^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.803s
|
||||||
|
user 0m1.748s
|
||||||
|
sys 0m0.054s
|
||||||
|
|
||||||
|
$ time rg -P -U '^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m2.962s
|
||||||
|
user 0m2.246s
|
||||||
|
sys 0m0.713s
|
||||||
|
```
|
||||||
|
|
||||||
|
Search times remain the same with the default regex engine, but the PCRE2
|
||||||
|
search gets _slower_. What happened? The secrets can be revealed with the
|
||||||
|
`--trace` flag once again. In the former case, ripgrep actually detects that
|
||||||
|
the pattern can't match across multiple lines, and so will fall back to the
|
||||||
|
"fast line search" strategy as with our search without `-U`.
|
||||||
|
|
||||||
|
However, for PCRE2, things are much worse. Namely, since Unicode mode is still
|
||||||
|
enabled, ripgrep is still going to decode UTF-8 to ensure that it hands only
|
||||||
|
valid UTF-8 to PCRE2. Unfortunately, one key downside of multiline search is
|
||||||
|
that ripgrep cannot do it incrementally. Since matches can be arbitrarily long,
|
||||||
|
ripgrep actually needs the entire file in memory at once. Normally, we can use
|
||||||
|
a memory map for this, but because we need to UTF-8 decode the file before
|
||||||
|
searching it, ripgrep winds up reading the entire contents of the file on to
|
||||||
|
the heap before executing a search. Owch.
|
||||||
|
|
||||||
|
OK, so Unicode is killing us here. The file we're searching is _mostly_ ASCII,
|
||||||
|
so maybe we're OK with missing some data. (Try `rg '[\w--\p{ascii}]'` to see
|
||||||
|
non-ASCII word characters that an ASCII-only `\w` character class would miss.)
|
||||||
|
We can disable Unicode in both searches, but this is done differently depending
|
||||||
|
on the regex engine we use:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ time rg '(?-u)^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.714s
|
||||||
|
user 0m1.669s
|
||||||
|
sys 0m0.044s
|
||||||
|
|
||||||
|
$ time rg -P '^\w{42}$' subtitles2016-sample --no-pcre2-unicode
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.997s
|
||||||
|
user 0m1.958s
|
||||||
|
sys 0m0.037s
|
||||||
|
```
|
||||||
|
|
||||||
|
For the most part, ripgrep's default regex engine performs about the same.
|
||||||
|
PCRE2 does improve a little bit, and is now almost as fast as the default
|
||||||
|
regex engine. If you look at the output of `--trace`, you'll see that ripgrep
|
||||||
|
will no longer perform UTF-8 decoding, but it does still use the slow
|
||||||
|
line-by-line searcher.
|
||||||
|
|
||||||
|
At this point, we can combine all of our insights above: let's try to get off
|
||||||
|
of the slow line-by-line searcher by enabling multiline mode, and let's stop
|
||||||
|
UTF-8 decoding by disabling Unicode support:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ time rg -U '(?-u)^\w{42}$' subtitles2016-sample
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.714s
|
||||||
|
user 0m1.655s
|
||||||
|
sys 0m0.058s
|
||||||
|
|
||||||
|
$ time rg -P -U '^\w{42}$' subtitles2016-sample --no-pcre2-unicode
|
||||||
|
21225780:EverymajordevelopmentinthehistoryofAmerica
|
||||||
|
|
||||||
|
real 0m1.121s
|
||||||
|
user 0m1.071s
|
||||||
|
sys 0m0.048s
|
||||||
|
```
|
||||||
|
|
||||||
|
Ah, there's PCRE2's JIT shining! ripgrep's default regex engine once again
|
||||||
|
remains about the same, but PCRE2 no longer needs to search line-by-line and it
|
||||||
|
no longer needs to do any kind of UTF-8 checks. This allows the file to get
|
||||||
|
memory mapped and passed right through PCRE2's JIT at impressive speeds. (As
|
||||||
|
a brief and interesting historical note, the configuration of "memory map +
|
||||||
|
multiline + no-Unicode" is exactly the configuration used by The Silver
|
||||||
|
Searcher. This analysis perhaps sheds some light on why that
|
||||||
|
configuration is useful!)
|
||||||
|
|
||||||
|
In summary, if you want PCRE2 to go as fast as possible and you don't care
|
||||||
|
about Unicode and you don't care about matches possibly spanning across
|
||||||
|
multiple lines, then enable multiline mode with `-U` and disable PCRE2's
|
||||||
|
Unicode support with the `--no-pcre2-unicode` flag.
|
||||||
|
|
||||||
|
Caveat emptor: This author is not a PCRE2 expert, so there may be APIs that can
|
||||||
|
improve performance that the author missed. Similarly, there may be alternative
|
||||||
|
designs for a searching tool that are more amenable to how PCRE2 works.
|
||||||
|
|
||||||
|
|
||||||
<h3 name="rg-other-cmd">
|
<h3 name="rg-other-cmd">
|
||||||
When I run <code>rg</code>, why does it execute some other command?
|
When I run <code>rg</code>, why does it execute some other command?
|
||||||
</h3>
|
</h3>
|
||||||
|
|||||||
18
GUIDE.md
18
GUIDE.md
@@ -227,7 +227,7 @@ with the following contents:
|
|||||||
```
|
```
|
||||||
|
|
||||||
ripgrep treats `.ignore` files with higher precedence than `.gitignore` files
|
ripgrep treats `.ignore` files with higher precedence than `.gitignore` files
|
||||||
(and treats `.rgignore` files with higher precdence than `.ignore` files).
|
(and treats `.rgignore` files with higher precedence than `.ignore` files).
|
||||||
This means ripgrep will see the `!log/` whitelist rule first and search that
|
This means ripgrep will see the `!log/` whitelist rule first and search that
|
||||||
directory.
|
directory.
|
||||||
|
|
||||||
@@ -235,6 +235,11 @@ Like `.gitignore`, a `.ignore` file can be placed in any directory. Its rules
|
|||||||
will be processed with respect to the directory it resides in, just like
|
will be processed with respect to the directory it resides in, just like
|
||||||
`.gitignore`.
|
`.gitignore`.
|
||||||
|
|
||||||
|
To process `.gitignore` and `.ignore` files case insensitively, use the flag
|
||||||
|
`--ignore-file-case-insensitive`. This is especially useful on case insensitive
|
||||||
|
file systems like those on Windows and macOS. Note though that this can come
|
||||||
|
with a significant performance penalty, and is therefore disabled by default.
|
||||||
|
|
||||||
For a more in depth description of how glob patterns in a `.gitignore` file
|
For a more in depth description of how glob patterns in a `.gitignore` file
|
||||||
are interpreted, please see `man gitignore`.
|
are interpreted, please see `man gitignore`.
|
||||||
|
|
||||||
@@ -580,7 +585,7 @@ override it.
|
|||||||
|
|
||||||
If you're confused about what configuration file ripgrep is reading arguments
|
If you're confused about what configuration file ripgrep is reading arguments
|
||||||
from, then running ripgrep with the `--debug` flag should help clarify things.
|
from, then running ripgrep with the `--debug` flag should help clarify things.
|
||||||
The debug output should note what config file is being loaded and the arugments
|
The debug output should note what config file is being loaded and the arguments
|
||||||
that have been read from the configuration.
|
that have been read from the configuration.
|
||||||
|
|
||||||
Finally, if you want to make absolutely sure that ripgrep *isn't* reading a
|
Finally, if you want to make absolutely sure that ripgrep *isn't* reading a
|
||||||
@@ -604,7 +609,8 @@ topic, but we can try to summarize its relevancy to ripgrep:
|
|||||||
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
|
the most popular encodings likely consist of ASCII, latin1 or UTF-8. As
|
||||||
a special exception, UTF-16 is prevalent in Windows environments
|
a special exception, UTF-16 is prevalent in Windows environments
|
||||||
|
|
||||||
In light of the above, here is how ripgrep behaves:
|
In light of the above, here is how ripgrep behaves when `--encoding auto` is
|
||||||
|
given, which is the default:
|
||||||
|
|
||||||
* All input is assumed to be ASCII compatible (which means every byte that
|
* All input is assumed to be ASCII compatible (which means every byte that
|
||||||
corresponds to an ASCII codepoint actually is an ASCII codepoint). This
|
corresponds to an ASCII codepoint actually is an ASCII codepoint). This
|
||||||
@@ -675,10 +681,10 @@ used options that will likely impact how you use ripgrep on a regular basis.
|
|||||||
* `--files`: Print the files that ripgrep *would* search, but don't actually
|
* `--files`: Print the files that ripgrep *would* search, but don't actually
|
||||||
search them.
|
search them.
|
||||||
* `-a/--text`: Search binary files as if they were plain text.
|
* `-a/--text`: Search binary files as if they were plain text.
|
||||||
* `-z/--search-zip`: Search compressed files (gzip, bzip2, lzma, xz). This is
|
* `-z/--search-zip`: Search compressed files (gzip, bzip2, lzma, xz, lz4,
|
||||||
disabled by default.
|
brotli, zstd). This is disabled by default.
|
||||||
* `-C/--context`: Show the lines surrounding a match.
|
* `-C/--context`: Show the lines surrounding a match.
|
||||||
* `--sort-files`: Force ripgrep to sort its output by file name. (This disables
|
* `--sort path`: Force ripgrep to sort its output by file name. (This disables
|
||||||
parallelism, so it might be slower.)
|
parallelism, so it might be slower.)
|
||||||
* `-L/--follow`: Follow symbolic links while recursively searching.
|
* `-L/--follow`: Follow symbolic links while recursively searching.
|
||||||
* `-M/--max-columns`: Limit the length of lines printed by ripgrep.
|
* `-M/--max-columns`: Limit the length of lines printed by ripgrep.
|
||||||
|
|||||||
179
README.md
179
README.md
@@ -7,7 +7,7 @@ available for [every release](https://github.com/BurntSushi/ripgrep/releases).
|
|||||||
ripgrep is similar to other popular search tools like The Silver Searcher,
|
ripgrep is similar to other popular search tools like The Silver Searcher,
|
||||||
ack and grep.
|
ack and grep.
|
||||||
|
|
||||||
[](https://travis-ci.org/BurntSushi/ripgrep)
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
[](https://crates.io/crates/ripgrep)
|
[](https://crates.io/crates/ripgrep)
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@ Please see the [CHANGELOG](CHANGELOG.md) for a release history.
|
|||||||
* [Installation](#installation)
|
* [Installation](#installation)
|
||||||
* [User Guide](GUIDE.md)
|
* [User Guide](GUIDE.md)
|
||||||
* [Frequently Asked Questions](FAQ.md)
|
* [Frequently Asked Questions](FAQ.md)
|
||||||
* [Regex syntax](https://docs.rs/regex/0.2.5/regex/#syntax)
|
* [Regex syntax](https://docs.rs/regex/1/regex/#syntax)
|
||||||
* [Configuration files](GUIDE.md#configuration-file)
|
* [Configuration files](GUIDE.md#configuration-file)
|
||||||
* [Shell completions](FAQ.md#complete)
|
* [Shell completions](FAQ.md#complete)
|
||||||
* [Building](#building)
|
* [Building](#building)
|
||||||
@@ -85,14 +85,16 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
|
|||||||
|
|
||||||
### Why should I use ripgrep?
|
### Why should I use ripgrep?
|
||||||
|
|
||||||
* It can replace many use cases served by both The Silver Searcher and GNU grep
|
* It can replace many use cases served by other search tools
|
||||||
because it is generally faster than both. (See [the FAQ](FAQ.md#posix4ever)
|
because it contains most of their features and is generally faster. (See
|
||||||
for more details on whether ripgrep can truly replace grep.)
|
[the FAQ](FAQ.md#posix4ever) for more details on whether ripgrep can truly
|
||||||
* Like The Silver Searcher, ripgrep defaults to recursive directory search
|
replace grep.)
|
||||||
and won't search files ignored by your `.gitignore` files. It also ignores
|
* Like other tools specialized to code search, ripgrep defaults to recursive
|
||||||
hidden and binary files by default. ripgrep also implements full support
|
directory search and won't search files ignored by your `.gitignore` files.
|
||||||
for `.gitignore`, whereas there are many bugs related to that functionality
|
It also ignores hidden and binary files by default. ripgrep also implements
|
||||||
in The Silver Searcher.
|
full support for `.gitignore`, whereas there are many bugs related to that
|
||||||
|
functionality in other code search tools claiming to provide the same
|
||||||
|
functionality.
|
||||||
* ripgrep can search specific types of files. For example, `rg -tpy foo`
|
* ripgrep can search specific types of files. For example, `rg -tpy foo`
|
||||||
limits your search to Python files and `rg -Tjs foo` excludes Javascript
|
limits your search to Python files and `rg -Tjs foo` excludes Javascript
|
||||||
files from your search. ripgrep can be taught about new file types with
|
files from your search. ripgrep can be taught about new file types with
|
||||||
@@ -101,6 +103,10 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
|
|||||||
of search results, searching multiple patterns, highlighting matches with
|
of search results, searching multiple patterns, highlighting matches with
|
||||||
color and full Unicode support. Unlike GNU grep, ripgrep stays fast while
|
color and full Unicode support. Unlike GNU grep, ripgrep stays fast while
|
||||||
supporting Unicode (which is always on).
|
supporting Unicode (which is always on).
|
||||||
|
* ripgrep has optional support for switching its regex engine to use PCRE2.
|
||||||
|
Among other things, this makes it possible to use look-around and
|
||||||
|
backreferences in your patterns, which are not supported in ripgrep's default
|
||||||
|
regex engine. PCRE2 support is enabled with `-P`.
|
||||||
* ripgrep supports searching files in text encodings other than UTF-8, such
|
* ripgrep supports searching files in text encodings other than UTF-8, such
|
||||||
as UTF-16, latin-1, GBK, EUC-JP, Shift_JIS and more. (Some support for
|
as UTF-16, latin-1, GBK, EUC-JP, Shift_JIS and more. (Some support for
|
||||||
automatically detecting UTF-16 is provided. Other text encodings must be
|
automatically detecting UTF-16 is provided. Other text encodings must be
|
||||||
@@ -112,27 +118,29 @@ increases the times to `2.640s` for ripgrep and `10.277s` for GNU grep.
|
|||||||
detection and so on.
|
detection and so on.
|
||||||
|
|
||||||
In other words, use ripgrep if you like speed, filtering by default, fewer
|
In other words, use ripgrep if you like speed, filtering by default, fewer
|
||||||
bugs, and Unicode support.
|
bugs and Unicode support.
|
||||||
|
|
||||||
|
|
||||||
### Why shouldn't I use ripgrep?
|
### Why shouldn't I use ripgrep?
|
||||||
|
|
||||||
I'd like to try to convince you why you *shouldn't* use ripgrep. This should
|
Despite initially not wanting to add every feature under the sun to ripgrep,
|
||||||
give you a glimpse at some important downsides or missing features of
|
over time, ripgrep has grown support for most features found in other file
|
||||||
ripgrep.
|
searching tools. This includes searching for results spanning across multiple
|
||||||
|
lines, and opt-in support for PCRE2, which provides look-around and
|
||||||
|
backreference support.
|
||||||
|
|
||||||
* ripgrep uses a regex engine based on finite automata, so if you want fancy
|
At this point, the primary reasons not to use ripgrep probably consist of one
|
||||||
regex features such as backreferences or lookaround, ripgrep won't provide
|
or more of the following:
|
||||||
them to you. ripgrep does support lots of things though, including, but not
|
|
||||||
limited to: lazy quantification (e.g., `a+?`), repetitions (e.g., `a{2,5}`),
|
|
||||||
begin/end assertions (e.g., `^\w+$`), word boundaries (e.g., `\bfoo\b`), and
|
|
||||||
support for Unicode categories (e.g., `\p{Sc}` to match currency symbols or
|
|
||||||
`\p{Lu}` to match any uppercase letter). (Fancier regexes will never be
|
|
||||||
supported.)
|
|
||||||
* ripgrep doesn't have multiline search. (Will happen as an opt-in feature.)
|
|
||||||
|
|
||||||
In other words, if you like fancy regexes or multiline search, then ripgrep
|
* You need a portable and ubiquitous tool. While ripgrep works on Windows,
|
||||||
may not quite meet your needs (yet).
|
macOS and Linux, it is not ubiquitous and it does not conform to any
|
||||||
|
standard such as POSIX. The best tool for this job is good old grep.
|
||||||
|
* There still exists some other feature (or bug) not listed in this README that
|
||||||
|
you rely on that's in another tool that isn't in ripgrep.
|
||||||
|
* There is a performance edge case where ripgrep doesn't do well where another
|
||||||
|
tool does do well. (Please file a bug report!)
|
||||||
|
* ripgrep isn't possible to install on your machine or isn't available for your
|
||||||
|
platform. (Please file a bug report!)
|
||||||
|
|
||||||
|
|
||||||
### Is it really faster than everything else?
|
### Is it really faster than everything else?
|
||||||
@@ -145,7 +153,8 @@ Summarizing, ripgrep is fast because:
|
|||||||
* It is built on top of
|
* It is built on top of
|
||||||
[Rust's regex engine](https://github.com/rust-lang-nursery/regex).
|
[Rust's regex engine](https://github.com/rust-lang-nursery/regex).
|
||||||
Rust's regex engine uses finite automata, SIMD and aggressive literal
|
Rust's regex engine uses finite automata, SIMD and aggressive literal
|
||||||
optimizations to make searching very fast.
|
optimizations to make searching very fast. (PCRE2 support can be opted into
|
||||||
|
with the `-P/--pcre2` flag.)
|
||||||
* Rust's regex library maintains performance with full Unicode support by
|
* Rust's regex library maintains performance with full Unicode support by
|
||||||
building UTF-8 decoding directly into its deterministic finite automaton
|
building UTF-8 decoding directly into its deterministic finite automaton
|
||||||
engine.
|
engine.
|
||||||
@@ -154,7 +163,7 @@ Summarizing, ripgrep is fast because:
|
|||||||
latter is better for large directories. ripgrep chooses the best searching
|
latter is better for large directories. ripgrep chooses the best searching
|
||||||
strategy for you automatically.
|
strategy for you automatically.
|
||||||
* Applies your ignore patterns in `.gitignore` files using a
|
* Applies your ignore patterns in `.gitignore` files using a
|
||||||
[`RegexSet`](https://docs.rs/regex/1.0.0/regex/struct.RegexSet.html).
|
[`RegexSet`](https://docs.rs/regex/1/regex/struct.RegexSet.html).
|
||||||
That means a single file path can be matched against multiple glob patterns
|
That means a single file path can be matched against multiple glob patterns
|
||||||
simultaneously.
|
simultaneously.
|
||||||
* It uses a lock-free parallel recursive directory iterator, courtesy of
|
* It uses a lock-free parallel recursive directory iterator, courtesy of
|
||||||
@@ -168,6 +177,11 @@ Andy Lester, author of [ack](https://beyondgrep.com/), has published an
|
|||||||
excellent table comparing the features of ack, ag, git-grep, GNU grep and
|
excellent table comparing the features of ack, ag, git-grep, GNU grep and
|
||||||
ripgrep: https://beyondgrep.com/feature-comparison/
|
ripgrep: https://beyondgrep.com/feature-comparison/
|
||||||
|
|
||||||
|
Note that ripgrep has grown a few significant new features recently that
|
||||||
|
are not yet present in Andy's table. This includes, but is not limited to,
|
||||||
|
configuration files, passthru, support for searching compressed files,
|
||||||
|
multiline search and opt-in fancy regex support via PCRE2.
|
||||||
|
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
|
|
||||||
@@ -207,13 +221,15 @@ If you're a **MacPorts** user, then you can install ripgrep from the
|
|||||||
$ sudo port install ripgrep
|
$ sudo port install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Windows Chocolatey** user, then you can install ripgrep from the [official repo](https://chocolatey.org/packages/ripgrep):
|
If you're a **Windows Chocolatey** user, then you can install ripgrep from the
|
||||||
|
[official repo](https://chocolatey.org/packages/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ choco install ripgrep
|
$ choco install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Windows Scoop** user, then you can install ripgrep from the [official bucket](https://github.com/lukesampson/scoop/blob/master/bucket/ripgrep.json):
|
If you're a **Windows Scoop** user, then you can install ripgrep from the
|
||||||
|
[official bucket](https://github.com/lukesampson/scoop/blob/master/bucket/ripgrep.json):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ scoop install ripgrep
|
$ scoop install ripgrep
|
||||||
@@ -225,32 +241,38 @@ If you're an **Arch Linux** user, then you can install ripgrep from the official
|
|||||||
$ pacman -S ripgrep
|
$ pacman -S ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Gentoo** user, you can install ripgrep from the [official repo](https://packages.gentoo.org/packages/sys-apps/ripgrep):
|
If you're a **Gentoo** user, you can install ripgrep from the
|
||||||
|
[official repo](https://packages.gentoo.org/packages/sys-apps/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ emerge sys-apps/ripgrep
|
$ emerge sys-apps/ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Fedora 27+** user, you can install ripgrep from official repositories.
|
If you're a **Fedora** user, you can install ripgrep from official
|
||||||
|
repositories.
|
||||||
|
|
||||||
```
|
```
|
||||||
$ sudo dnf install ripgrep
|
$ sudo dnf install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Fedora 24+** user, you can install ripgrep from [copr](https://copr.fedorainfracloud.org/coprs/carlwgeorge/ripgrep/):
|
If you're an **openSUSE Leap 15.0** user, you can install ripgrep from the
|
||||||
|
[utilities repo](https://build.opensuse.org/package/show/utilities/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ sudo dnf copr enable carlwgeorge/ripgrep
|
$ sudo zypper ar https://download.opensuse.org/repositories/utilities/openSUSE_Leap_15.0/utilities.repo
|
||||||
$ sudo dnf install ripgrep
|
$ sudo zypper install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're an **openSUSE Tumbleweed** user, you can install ripgrep from the [official repo](http://software.opensuse.org/package/ripgrep):
|
|
||||||
|
If you're an **openSUSE Tumbleweed** user, you can install ripgrep from the
|
||||||
|
[official repo](http://software.opensuse.org/package/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ sudo zypper install ripgrep
|
$ sudo zypper install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **RHEL/CentOS 7** user, you can install ripgrep from [copr](https://copr.fedorainfracloud.org/coprs/carlwgeorge/ripgrep/):
|
If you're a **RHEL/CentOS 7** user, you can install ripgrep from
|
||||||
|
[copr](https://copr.fedorainfracloud.org/coprs/carlwgeorge/ripgrep/):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ sudo yum-config-manager --add-repo=https://copr.fedorainfracloud.org/coprs/carlwgeorge/ripgrep/repo/epel-7/carlwgeorge-ripgrep-epel-7.repo
|
$ sudo yum-config-manager --add-repo=https://copr.fedorainfracloud.org/coprs/carlwgeorge/ripgrep/repo/epel-7/carlwgeorge-ripgrep-epel-7.repo
|
||||||
@@ -267,12 +289,25 @@ $ # (Or using the attribute name, which is also ripgrep.)
|
|||||||
|
|
||||||
If you're a **Debian** user (or a user of a Debian derivative like **Ubuntu**),
|
If you're a **Debian** user (or a user of a Debian derivative like **Ubuntu**),
|
||||||
then ripgrep can be installed using a binary `.deb` file provided in each
|
then ripgrep can be installed using a binary `.deb` file provided in each
|
||||||
[ripgrep release](https://github.com/BurntSushi/ripgrep/releases). Note that
|
[ripgrep release](https://github.com/BurntSushi/ripgrep/releases).
|
||||||
ripgrep is not in the official Debian or Ubuntu repositories.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
$ curl -LO https://github.com/BurntSushi/ripgrep/releases/download/0.8.1/ripgrep_0.8.1_amd64.deb
|
$ curl -LO https://github.com/BurntSushi/ripgrep/releases/download/0.10.0/ripgrep_0.10.0_amd64.deb
|
||||||
$ sudo dpkg -i ripgrep_0.8.1_amd64.deb
|
$ sudo dpkg -i ripgrep_0.10.0_amd64.deb
|
||||||
|
```
|
||||||
|
|
||||||
|
If you run Debian Buster (currently Debian testing) or Debian sid, ripgrep is
|
||||||
|
[officially maintained by Debian](https://tracker.debian.org/pkg/rust-ripgrep).
|
||||||
|
```
|
||||||
|
$ sudo apt-get install ripgrep
|
||||||
|
```
|
||||||
|
|
||||||
|
If you're an **Ubuntu Cosmic (18.10)** (or newer) user, ripgrep is
|
||||||
|
[available](https://launchpad.net/ubuntu/+source/rust-ripgrep) using the same
|
||||||
|
packaging as Debian:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ sudo apt-get install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
(N.B. Various snaps for ripgrep on Ubuntu are also available, but none of them
|
(N.B. Various snaps for ripgrep on Ubuntu are also available, but none of them
|
||||||
@@ -280,26 +315,30 @@ seem to work right and generate a number of very strange bug reports that I
|
|||||||
don't know how to fix and don't have the time to fix. Therefore, it is no
|
don't know how to fix and don't have the time to fix. Therefore, it is no
|
||||||
longer a recommended installation option.)
|
longer a recommended installation option.)
|
||||||
|
|
||||||
If you're a **FreeBSD** user, then you can install ripgrep from the [official ports](https://www.freshports.org/textproc/ripgrep/):
|
If you're a **FreeBSD** user, then you can install ripgrep from the
|
||||||
|
[official ports](https://www.freshports.org/textproc/ripgrep/):
|
||||||
|
|
||||||
```
|
```
|
||||||
# pkg install ripgrep
|
# pkg install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're an **OpenBSD** user, then you can install ripgrep from the [official ports](http://openports.se/textproc/ripgrep):
|
If you're an **OpenBSD** user, then you can install ripgrep from the
|
||||||
|
[official ports](http://openports.se/textproc/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
$ doas pkg_add ripgrep
|
$ doas pkg_add ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **NetBSD** user, then you can install ripgrep from [pkgsrc](http://pkgsrc.se/textproc/ripgrep):
|
If you're a **NetBSD** user, then you can install ripgrep from
|
||||||
|
[pkgsrc](http://pkgsrc.se/textproc/ripgrep):
|
||||||
|
|
||||||
```
|
```
|
||||||
# pkgin install ripgrep
|
# pkgin install ripgrep
|
||||||
```
|
```
|
||||||
|
|
||||||
If you're a **Rust programmer**, ripgrep can be installed with `cargo`.
|
If you're a **Rust programmer**, ripgrep can be installed with `cargo`.
|
||||||
* Note that the minimum supported version of Rust for ripgrep is **1.23.0**,
|
|
||||||
|
* Note that the minimum supported version of Rust for ripgrep is **1.32.0**,
|
||||||
although ripgrep may work with older versions.
|
although ripgrep may work with older versions.
|
||||||
* Note that the binary may be bigger than expected because it contains debug
|
* Note that the binary may be bigger than expected because it contains debug
|
||||||
symbols. This is intentional. To remove debug symbols and therefore reduce
|
symbols. This is intentional. To remove debug symbols and therefore reduce
|
||||||
@@ -320,7 +359,10 @@ ripgrep isn't currently in any other package repositories.
|
|||||||
|
|
||||||
ripgrep is written in Rust, so you'll need to grab a
|
ripgrep is written in Rust, so you'll need to grab a
|
||||||
[Rust installation](https://www.rust-lang.org/) in order to compile it.
|
[Rust installation](https://www.rust-lang.org/) in order to compile it.
|
||||||
ripgrep compiles with Rust 1.23.0 (stable) or newer. Building is easy:
|
ripgrep compiles with Rust 1.32.0 (stable) or newer. In general, ripgrep tracks
|
||||||
|
the latest stable release of the Rust compiler.
|
||||||
|
|
||||||
|
To build ripgrep:
|
||||||
|
|
||||||
```
|
```
|
||||||
$ git clone https://github.com/BurntSushi/ripgrep
|
$ git clone https://github.com/BurntSushi/ripgrep
|
||||||
@@ -334,18 +376,47 @@ If you have a Rust nightly compiler and a recent Intel CPU, then you can enable
|
|||||||
additional optional SIMD acceleration like so:
|
additional optional SIMD acceleration like so:
|
||||||
|
|
||||||
```
|
```
|
||||||
RUSTFLAGS="-C target-cpu=native" cargo build --release --features 'simd-accel avx-accel'
|
RUSTFLAGS="-C target-cpu=native" cargo build --release --features 'simd-accel'
|
||||||
```
|
```
|
||||||
|
|
||||||
If your machine doesn't support AVX instructions, then simply remove
|
The `simd-accel` feature enables SIMD support in certain ripgrep dependencies
|
||||||
`avx-accel` from the features list. Similarly for SIMD (which corresponds
|
(responsible for transcoding). They are not necessary to get SIMD optimizations
|
||||||
roughly to SSE instructions).
|
for search; those are enabled automatically. Hopefully, some day, the
|
||||||
|
`simd-accel` feature will similarly become unnecessary. **WARNING:** Currently,
|
||||||
|
enabling this option can increase compilation times dramatically.
|
||||||
|
|
||||||
The `simd-accel` and `avx-accel` features enable SIMD support in certain
|
Finally, optional PCRE2 support can be built with ripgrep by enabling the
|
||||||
ripgrep dependencies (responsible for counting lines and transcoding). They
|
`pcre2` feature:
|
||||||
are not necessary to get SIMD optimizations for search; those are enabled
|
|
||||||
automatically. Hopefully, some day, the `simd-accel` and `avx-accel` features
|
```
|
||||||
will similarly become unnecessary.
|
$ cargo build --release --features 'pcre2'
|
||||||
|
```
|
||||||
|
|
||||||
|
(Tip: use `--features 'pcre2 simd-accel'` to also include compile time SIMD
|
||||||
|
optimizations, which will only work with a nightly compiler.)
|
||||||
|
|
||||||
|
Enabling the PCRE2 feature works with a stable Rust compiler and will
|
||||||
|
attempt to automatically find and link with your system's PCRE2 library via
|
||||||
|
`pkg-config`. If one doesn't exist, then ripgrep will build PCRE2 from source
|
||||||
|
using your system's C compiler and then statically link it into the final
|
||||||
|
executable. Static linking can be forced even when there is an available PCRE2
|
||||||
|
system library by either building ripgrep with the MUSL target or by setting
|
||||||
|
`PCRE2_SYS_STATIC=1`.
|
||||||
|
|
||||||
|
ripgrep can be built with the MUSL target on Linux by first installing the MUSL
|
||||||
|
library on your system (consult your friendly neighborhood package manager).
|
||||||
|
Then you just need to add MUSL support to your Rust toolchain and rebuild
|
||||||
|
ripgrep, which yields a fully static executable:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ rustup target add x86_64-unknown-linux-musl
|
||||||
|
$ cargo build --release --target x86_64-unknown-linux-musl
|
||||||
|
```
|
||||||
|
|
||||||
|
Applying the `--features` flag from above works as expected. If you want to
|
||||||
|
build a static executable with MUSL and with PCRE2, then you will need to have
|
||||||
|
`musl-gcc` installed, which might be in a separate package from the actual
|
||||||
|
MUSL library, depending on your Linux distribution.
|
||||||
|
|
||||||
|
|
||||||
### Running tests
|
### Running tests
|
||||||
|
|||||||
39
appveyor.yml
39
appveyor.yml
@@ -1,8 +1,6 @@
|
|||||||
# Inspired from https://github.com/habitat-sh/habitat/blob/master/appveyor.yml
|
|
||||||
cache:
|
cache:
|
||||||
- c:\cargo\registry
|
- c:\cargo\registry
|
||||||
- c:\cargo\git
|
- c:\cargo\git
|
||||||
- c:\projects\ripgrep\target
|
|
||||||
|
|
||||||
init:
|
init:
|
||||||
- mkdir c:\cargo
|
- mkdir c:\cargo
|
||||||
@@ -19,14 +17,20 @@ environment:
|
|||||||
PROJECT_NAME: ripgrep
|
PROJECT_NAME: ripgrep
|
||||||
RUST_BACKTRACE: full
|
RUST_BACKTRACE: full
|
||||||
matrix:
|
matrix:
|
||||||
- TARGET: i686-pc-windows-gnu
|
|
||||||
CHANNEL: stable
|
|
||||||
- TARGET: i686-pc-windows-msvc
|
|
||||||
CHANNEL: stable
|
|
||||||
- TARGET: x86_64-pc-windows-gnu
|
- TARGET: x86_64-pc-windows-gnu
|
||||||
CHANNEL: stable
|
CHANNEL: stable
|
||||||
|
BITS: 64
|
||||||
|
MSYS2: 1
|
||||||
- TARGET: x86_64-pc-windows-msvc
|
- TARGET: x86_64-pc-windows-msvc
|
||||||
CHANNEL: stable
|
CHANNEL: stable
|
||||||
|
BITS: 64
|
||||||
|
- TARGET: i686-pc-windows-gnu
|
||||||
|
CHANNEL: stable
|
||||||
|
BITS: 32
|
||||||
|
MSYS2: 1
|
||||||
|
- TARGET: i686-pc-windows-msvc
|
||||||
|
CHANNEL: stable
|
||||||
|
BITS: 32
|
||||||
|
|
||||||
matrix:
|
matrix:
|
||||||
fast_finish: true
|
fast_finish: true
|
||||||
@@ -35,27 +39,27 @@ matrix:
|
|||||||
# (Based on from https://github.com/rust-lang/libc/blob/master/appveyor.yml)
|
# (Based on from https://github.com/rust-lang/libc/blob/master/appveyor.yml)
|
||||||
install:
|
install:
|
||||||
- curl -sSf -o rustup-init.exe https://win.rustup.rs/
|
- curl -sSf -o rustup-init.exe https://win.rustup.rs/
|
||||||
- rustup-init.exe -y --default-host %TARGET% --no-modify-path
|
- rustup-init.exe -y --default-host %TARGET%
|
||||||
- if defined MSYS2_BITS set PATH=%PATH%;C:\msys64\mingw%MSYS2_BITS%\bin
|
- set PATH=%PATH%;C:\Users\appveyor\.cargo\bin
|
||||||
|
- if defined MSYS2 set PATH=C:\msys64\mingw%BITS%\bin;%PATH%
|
||||||
- rustc -V
|
- rustc -V
|
||||||
- cargo -V
|
- cargo -V
|
||||||
|
|
||||||
# ???
|
# Hack to work around a harmless warning in Appveyor builds?
|
||||||
build: false
|
build: false
|
||||||
|
|
||||||
# Equivalent to Travis' `script` phase
|
# Equivalent to Travis' `script` phase
|
||||||
# TODO modify this phase as you see fit
|
|
||||||
test_script:
|
test_script:
|
||||||
- cargo test --verbose --all
|
- cargo test --verbose --all --features pcre2
|
||||||
|
|
||||||
before_deploy:
|
before_deploy:
|
||||||
# Generate artifacts for release
|
# Generate artifacts for release
|
||||||
- cargo build --release
|
- cargo build --release --features pcre2
|
||||||
- mkdir staging
|
- mkdir staging
|
||||||
- copy target\release\rg.exe staging
|
- copy target\release\rg.exe staging
|
||||||
- ps: copy target\release\build\ripgrep-*\out\_rg.ps1 staging
|
- ps: copy target\release\build\ripgrep-*\out\_rg.ps1 staging
|
||||||
- cd staging
|
- cd staging
|
||||||
# release zipfile will look like 'rust-everywhere-v1.2.3-x86_64-pc-windows-msvc'
|
# release zipfile will look like 'ripgrep-1.2.3-x86_64-pc-windows-msvc'
|
||||||
- 7z a ../%PROJECT_NAME%-%APPVEYOR_REPO_TAG_NAME%-%TARGET%.zip *
|
- 7z a ../%PROJECT_NAME%-%APPVEYOR_REPO_TAG_NAME%-%TARGET%.zip *
|
||||||
- appveyor PushArtifact ../%PROJECT_NAME%-%APPVEYOR_REPO_TAG_NAME%-%TARGET%.zip
|
- appveyor PushArtifact ../%PROJECT_NAME%-%APPVEYOR_REPO_TAG_NAME%-%TARGET%.zip
|
||||||
|
|
||||||
@@ -68,17 +72,10 @@ deploy:
|
|||||||
provider: GitHub
|
provider: GitHub
|
||||||
# deploy when a new tag is pushed and only on the stable channel
|
# deploy when a new tag is pushed and only on the stable channel
|
||||||
on:
|
on:
|
||||||
# channel to use to produce the release artifacts
|
|
||||||
# NOTE make sure you only release *once* per target
|
|
||||||
# TODO you may want to pick a different channel
|
|
||||||
CHANNEL: stable
|
CHANNEL: stable
|
||||||
appveyor_repo_tag: true
|
appveyor_repo_tag: true
|
||||||
|
|
||||||
branches:
|
branches:
|
||||||
only:
|
only:
|
||||||
- /\d+\.\d+\.\d+/
|
- /^\d+\.\d+\.\d+$/
|
||||||
- master
|
- master
|
||||||
# - appveyor
|
|
||||||
# - /\d+\.\d+\.\d+/
|
|
||||||
# except:
|
|
||||||
# - master
|
|
||||||
|
|||||||
12
build.rs
12
build.rs
@@ -1,8 +1,3 @@
|
|||||||
#[macro_use]
|
|
||||||
extern crate clap;
|
|
||||||
#[macro_use]
|
|
||||||
extern crate lazy_static;
|
|
||||||
|
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::fs::{self, File};
|
use std::fs::{self, File};
|
||||||
use std::io::{self, Read, Write};
|
use std::io::{self, Read, Write};
|
||||||
@@ -168,7 +163,12 @@ fn formatted_arg(arg: &RGArg) -> io::Result<String> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn formatted_doc_txt(arg: &RGArg) -> io::Result<String> {
|
fn formatted_doc_txt(arg: &RGArg) -> io::Result<String> {
|
||||||
let paragraphs: Vec<&str> = arg.doc_long.split("\n\n").collect();
|
let paragraphs: Vec<String> = arg.doc_long
|
||||||
|
.replace("{", "{")
|
||||||
|
.replace("}", r"}")
|
||||||
|
.split("\n\n")
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.collect();
|
||||||
if paragraphs.is_empty() {
|
if paragraphs.is_empty() {
|
||||||
return Err(ioerr(format!("missing docs for --{}", arg.name)));
|
return Err(ioerr(format!("missing docs for --{}", arg.name)));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,7 +8,13 @@ set -ex
|
|||||||
|
|
||||||
# Generate artifacts for release
|
# Generate artifacts for release
|
||||||
mk_artifacts() {
|
mk_artifacts() {
|
||||||
cargo build --target "$TARGET" --release
|
if is_arm; then
|
||||||
|
cargo build --target "$TARGET" --release
|
||||||
|
else
|
||||||
|
# Technically, MUSL builds will force PCRE2 to get statically compiled,
|
||||||
|
# but we also want PCRE2 statically build for macOS binaries.
|
||||||
|
PCRE2_SYS_STATIC=1 cargo build --target "$TARGET" --release --features 'pcre2'
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
mk_tarball() {
|
mk_tarball() {
|
||||||
|
|||||||
43
ci/build_deb.sh
Executable file
43
ci/build_deb.sh
Executable file
@@ -0,0 +1,43 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# This script builds a binary dpkg for Debian based distros. It does not
|
||||||
|
# currently run in CI, and is instead run manually and the resulting dpkg is
|
||||||
|
# uploaded to GitHub via the web UI.
|
||||||
|
#
|
||||||
|
# Note that this requires 'cargo deb', which can be installed with
|
||||||
|
# 'cargo install cargo-deb'.
|
||||||
|
#
|
||||||
|
# This should be run from the root of the ripgrep repo.
|
||||||
|
|
||||||
|
if ! command -V cargo-deb > /dev/null 2>&1; then
|
||||||
|
echo "cargo-deb command missing" >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# 'cargo deb' does not seem to provide a way to specify an asset that is
|
||||||
|
# created at build time, such as ripgrep's man page. To work around this,
|
||||||
|
# we force a debug build, copy out the man page (and shell completions)
|
||||||
|
# produced from that build, put it into a predictable location and then build
|
||||||
|
# the deb, which knows where to look.
|
||||||
|
|
||||||
|
DEPLOY_DIR=deployment/deb
|
||||||
|
mkdir -p "$DEPLOY_DIR"
|
||||||
|
cargo build
|
||||||
|
|
||||||
|
# Find and copy man page.
|
||||||
|
manpage="$(find ./target/debug -name rg.1 -print0 | xargs -0 ls -t | head -n1)"
|
||||||
|
cp "$manpage" "$DEPLOY_DIR/"
|
||||||
|
|
||||||
|
# Do the same for shell completions.
|
||||||
|
compbash="$(find ./target/debug -name rg.bash -print0 | xargs -0 ls -t | head -n1)"
|
||||||
|
cp "$compbash" "$DEPLOY_DIR/"
|
||||||
|
compfish="$(find ./target/debug -name rg.fish -print0 | xargs -0 ls -t | head -n1)"
|
||||||
|
cp "$compfish" "$DEPLOY_DIR/"
|
||||||
|
compzsh="complete/_rg"
|
||||||
|
cp "$compzsh" "$DEPLOY_DIR/"
|
||||||
|
|
||||||
|
# Since we're distributing the dpkg, we don't know whether the user will have
|
||||||
|
# PCRE2 installed, so just do a static build.
|
||||||
|
PCRE2_SYS_STATIC=1 cargo deb
|
||||||
@@ -8,7 +8,11 @@ set -ex
|
|||||||
|
|
||||||
main() {
|
main() {
|
||||||
# Test a normal debug build.
|
# Test a normal debug build.
|
||||||
cargo build --target "$TARGET" --verbose --all
|
if is_arm; then
|
||||||
|
cargo build --target "$TARGET" --verbose
|
||||||
|
else
|
||||||
|
cargo build --target "$TARGET" --verbose --all --features 'pcre2'
|
||||||
|
fi
|
||||||
|
|
||||||
# Show the output of the most recent build.rs stderr.
|
# Show the output of the most recent build.rs stderr.
|
||||||
set +x
|
set +x
|
||||||
@@ -40,7 +44,7 @@ main() {
|
|||||||
"$(dirname "${0}")/test_complete.sh"
|
"$(dirname "${0}")/test_complete.sh"
|
||||||
|
|
||||||
# Run tests for ripgrep and all sub-crates.
|
# Run tests for ripgrep and all sub-crates.
|
||||||
cargo test --target "$TARGET" --verbose --all
|
cargo test --target "$TARGET" --verbose --all --features 'pcre2'
|
||||||
}
|
}
|
||||||
|
|
||||||
main
|
main
|
||||||
|
|||||||
@@ -39,12 +39,14 @@ main() {
|
|||||||
print -rl - 'Comparing options:' "-$rg" "+$_rg"
|
print -rl - 'Comparing options:' "-$rg" "+$_rg"
|
||||||
|
|
||||||
# 'Parse' options out of the `--help` output. To prevent false positives we
|
# 'Parse' options out of the `--help` output. To prevent false positives we
|
||||||
# only look at lines where the first non-white-space character is `-`
|
# only look at lines where the first non-white-space character is `-`, or
|
||||||
|
# where a long option starting with certain letters (see `_rg`) is found.
|
||||||
|
# Occasionally we may have to handle some manually, however
|
||||||
help_args=( ${(f)"$(
|
help_args=( ${(f)"$(
|
||||||
$rg --help |
|
$rg --help |
|
||||||
$rg -- '^\s*-' |
|
$rg -i -- '^\s+--?[a-z0-9]|--[imnp]' |
|
||||||
$rg -io -- '[\t ,](-[a-z0-9]|--[a-z0-9-]+)\b' |
|
$rg -ior '$1' -- $'[\t /\"\'`.,](-[a-z0-9]|--[a-z0-9-]+)\\b' |
|
||||||
tr -d '\t ,' |
|
$rg -v -- --print0 | # False positives
|
||||||
sort -u
|
sort -u
|
||||||
)"} )
|
)"} )
|
||||||
|
|
||||||
@@ -58,8 +60,6 @@ main() {
|
|||||||
comp_args=( ${comp_args%%-[:[]*} ) # Strip everything after -optname-
|
comp_args=( ${comp_args%%-[:[]*} ) # Strip everything after -optname-
|
||||||
comp_args=( ${comp_args%%[:+=[]*} ) # Strip everything after other optspecs
|
comp_args=( ${comp_args%%[:+=[]*} ) # Strip everything after other optspecs
|
||||||
comp_args=( ${comp_args##[^-]*} ) # Remove non-options
|
comp_args=( ${comp_args##[^-]*} ) # Remove non-options
|
||||||
|
|
||||||
# This probably isn't necessary, but we should ensure the same order
|
|
||||||
comp_args=( ${(f)"$( print -rl - $comp_args | sort -u )"} )
|
comp_args=( ${(f)"$( print -rl - $comp_args | sort -u )"} )
|
||||||
|
|
||||||
(( $#help_args )) || {
|
(( $#help_args )) || {
|
||||||
|
|||||||
@@ -55,13 +55,6 @@ gcc_prefix() {
|
|||||||
esac
|
esac
|
||||||
}
|
}
|
||||||
|
|
||||||
is_ssse3_target() {
|
|
||||||
case "$(architecture)" in
|
|
||||||
amd64) return 0 ;;
|
|
||||||
*) return 1 ;;
|
|
||||||
esac
|
|
||||||
}
|
|
||||||
|
|
||||||
is_x86() {
|
is_x86() {
|
||||||
case "$(architecture)" in
|
case "$(architecture)" in
|
||||||
amd64|i386) return 0 ;;
|
amd64|i386) return 0 ;;
|
||||||
|
|||||||
260
complete/_rg
260
complete/_rg
@@ -6,8 +6,8 @@
|
|||||||
# Run ci/test_complete.sh after building to ensure that the options supported by
|
# Run ci/test_complete.sh after building to ensure that the options supported by
|
||||||
# this function stay in synch with the `rg` binary.
|
# this function stay in synch with the `rg` binary.
|
||||||
#
|
#
|
||||||
# @see http://zsh.sourceforge.net/Doc/Release/Completion-System.html
|
# For convenience, a completion reference guide is included at the bottom of
|
||||||
# @see https://github.com/zsh-users/zsh/blob/master/Etc/completion-style-guide
|
# this file.
|
||||||
#
|
#
|
||||||
# Originally based on code from the zsh-users project — see copyright notice
|
# Originally based on code from the zsh-users project — see copyright notice
|
||||||
# below.
|
# below.
|
||||||
@@ -26,8 +26,10 @@ _rg() {
|
|||||||
# style set. Note that this prefix check has to be updated manually to account
|
# style set. Note that this prefix check has to be updated manually to account
|
||||||
# for all of the potential negation options listed below!
|
# for all of the potential negation options listed below!
|
||||||
if
|
if
|
||||||
# (--[imn]* => --ignore*, --messages, --no-*)
|
# We also want to list all of these options during testing
|
||||||
[[ $PREFIX$SUFFIX == --[imn]* ]] ||
|
[[ $_RG_COMPLETE_LIST_ARGS == (1|t*|y*) ]] ||
|
||||||
|
# (--[imnp]* => --ignore*, --messages, --no-*, --pcre2-unicode)
|
||||||
|
[[ $PREFIX$SUFFIX == --[imnp]* ]] ||
|
||||||
zstyle -t ":complete:$curcontext:*" complete-all
|
zstyle -t ":complete:$curcontext:*" complete-all
|
||||||
then
|
then
|
||||||
no=
|
no=
|
||||||
@@ -42,6 +44,12 @@ _rg() {
|
|||||||
'(: * -)'{-h,--help}'[display help information]'
|
'(: * -)'{-h,--help}'[display help information]'
|
||||||
'(: * -)'{-V,--version}'[display version information]'
|
'(: * -)'{-V,--version}'[display version information]'
|
||||||
|
|
||||||
|
+ '(buffered)' # buffering options
|
||||||
|
'--line-buffered[force line buffering]'
|
||||||
|
$no"--no-line-buffered[don't force line buffering]"
|
||||||
|
'--block-buffered[force block buffering]'
|
||||||
|
$no"--no-block-buffered[don't force block buffering]"
|
||||||
|
|
||||||
+ '(case)' # Case-sensitivity options
|
+ '(case)' # Case-sensitivity options
|
||||||
{-i,--ignore-case}'[search case-insensitively]'
|
{-i,--ignore-case}'[search case-insensitively]'
|
||||||
{-s,--case-sensitive}'[search case-sensitively]'
|
{-s,--case-sensitive}'[search case-sensitively]'
|
||||||
@@ -61,11 +69,15 @@ _rg() {
|
|||||||
$no"--no-column[don't show column numbers for matches]"
|
$no"--no-column[don't show column numbers for matches]"
|
||||||
|
|
||||||
+ '(count)' # Counting options
|
+ '(count)' # Counting options
|
||||||
'(passthru)'{-c,--count}'[only show count of matching lines for each file]'
|
{-c,--count}'[only show count of matching lines for each file]'
|
||||||
'(passthru)--count-matches[only show count of individual matches for each file]'
|
'--count-matches[only show count of individual matches for each file]'
|
||||||
|
|
||||||
|
+ '(encoding)' # Encoding options
|
||||||
|
{-E+,--encoding=}'[specify text encoding of files to search]: :_rg_encodings'
|
||||||
|
$no'--no-encoding[use default text encoding]'
|
||||||
|
|
||||||
+ file # File-input options
|
+ file # File-input options
|
||||||
'*'{-f+,--file=}'[specify file containing patterns to search for]: :_files'
|
'(1)*'{-f+,--file=}'[specify file containing patterns to search for]: :_files'
|
||||||
|
|
||||||
+ '(file-match)' # Files with/without match options
|
+ '(file-match)' # Files with/without match options
|
||||||
'(stats)'{-l,--files-with-matches}'[only show names of files with matches]'
|
'(stats)'{-l,--files-with-matches}'[only show names of files with matches]'
|
||||||
@@ -75,6 +87,10 @@ _rg() {
|
|||||||
{-H,--with-filename}'[show file name for matches]'
|
{-H,--with-filename}'[show file name for matches]'
|
||||||
"--no-filename[don't show file name for matches]"
|
"--no-filename[don't show file name for matches]"
|
||||||
|
|
||||||
|
+ '(file-system)' # File system options
|
||||||
|
"--one-file-system[don't descend into directories on other file systems]"
|
||||||
|
$no'--no-one-file-system[descend into directories on other file systems]'
|
||||||
|
|
||||||
+ '(fixed)' # Fixed-string options
|
+ '(fixed)' # Fixed-string options
|
||||||
{-F,--fixed-strings}'[treat pattern as literal string instead of regular expression]'
|
{-F,--fixed-strings}'[treat pattern as literal string instead of regular expression]'
|
||||||
$no"--no-fixed-strings[don't treat pattern as literal string]"
|
$no"--no-fixed-strings[don't treat pattern as literal string]"
|
||||||
@@ -96,8 +112,12 @@ _rg() {
|
|||||||
$no"--no-hidden[don't search hidden files and directories]"
|
$no"--no-hidden[don't search hidden files and directories]"
|
||||||
|
|
||||||
+ '(ignore)' # Ignore-file options
|
+ '(ignore)' # Ignore-file options
|
||||||
"(--no-ignore-global --no-ignore-parent --no-ignore-vcs)--no-ignore[don't respect ignore files]"
|
"(--no-ignore-global --no-ignore-parent --no-ignore-vcs --no-ignore-dot)--no-ignore[don't respect ignore files]"
|
||||||
$no'(--ignore-global --ignore-parent --ignore-vcs)--ignore[respect ignore files]'
|
$no'(--ignore-global --ignore-parent --ignore-vcs --ignore-dot)--ignore[respect ignore files]'
|
||||||
|
|
||||||
|
+ '(ignore-file-case-insensitive)' # Ignore-file case sensitivity options
|
||||||
|
'--ignore-file-case-insensitive[process ignore files case insensitively]'
|
||||||
|
$no'--no-ignore-file-case-insensitive[process ignore files case sensitively]'
|
||||||
|
|
||||||
+ '(ignore-global)' # Global ignore-file options
|
+ '(ignore-global)' # Global ignore-file options
|
||||||
"--no-ignore-global[don't respect global ignore files]"
|
"--no-ignore-global[don't respect global ignore files]"
|
||||||
@@ -111,10 +131,23 @@ _rg() {
|
|||||||
"--no-ignore-vcs[don't respect version control ignore files]"
|
"--no-ignore-vcs[don't respect version control ignore files]"
|
||||||
$no'--ignore-vcs[respect version control ignore files]'
|
$no'--ignore-vcs[respect version control ignore files]'
|
||||||
|
|
||||||
+ '(line)' # Line-number options
|
+ '(ignore-dot)' # .ignore-file options
|
||||||
|
"--no-ignore-dot[don't respect .ignore files]"
|
||||||
|
$no'--ignore-dot[respect .ignore files]'
|
||||||
|
|
||||||
|
+ '(json)' # JSON options
|
||||||
|
'--json[output results in JSON Lines format]'
|
||||||
|
$no"--no-json[don't output results in JSON Lines format]"
|
||||||
|
|
||||||
|
+ '(line-number)' # Line-number options
|
||||||
{-n,--line-number}'[show line numbers for matches]'
|
{-n,--line-number}'[show line numbers for matches]'
|
||||||
{-N,--no-line-number}"[don't show line numbers for matches]"
|
{-N,--no-line-number}"[don't show line numbers for matches]"
|
||||||
|
|
||||||
|
+ '(line-terminator)' # Line-terminator options
|
||||||
|
'--crlf[use CRLF as line terminator]'
|
||||||
|
$no"--no-crlf[don't use CRLF as line terminator]"
|
||||||
|
'(text)--null-data[use NUL as line terminator]'
|
||||||
|
|
||||||
+ '(max-depth)' # Directory-depth options
|
+ '(max-depth)' # Directory-depth options
|
||||||
'--max-depth=[specify max number of directories to descend]:number of directories'
|
'--max-depth=[specify max number of directories to descend]:number of directories'
|
||||||
'!--maxdepth=:number of directories'
|
'!--maxdepth=:number of directories'
|
||||||
@@ -131,17 +164,36 @@ _rg() {
|
|||||||
'--mmap[search using memory maps when possible]'
|
'--mmap[search using memory maps when possible]'
|
||||||
"--no-mmap[don't search using memory maps]"
|
"--no-mmap[don't search using memory maps]"
|
||||||
|
|
||||||
|
+ '(multiline)' # Multiline options
|
||||||
|
{-U,--multiline}'[permit matching across multiple lines]'
|
||||||
|
$no'(multiline-dotall)--no-multiline[restrict matches to at most one line each]'
|
||||||
|
|
||||||
|
+ '(multiline-dotall)' # Multiline DOTALL options
|
||||||
|
'(--no-multiline)--multiline-dotall[allow "." to match newline (with -U)]'
|
||||||
|
$no"(--no-multiline)--no-multiline-dotall[don't allow \".\" to match newline (with -U)]"
|
||||||
|
|
||||||
+ '(only)' # Only-match options
|
+ '(only)' # Only-match options
|
||||||
'(passthru replace)'{-o,--only-matching}'[show only matching part of each line]'
|
{-o,--only-matching}'[show only matching part of each line]'
|
||||||
|
|
||||||
+ '(passthru)' # Pass-through options
|
+ '(passthru)' # Pass-through options
|
||||||
'(--vimgrep count only replace)--passthru[show both matching and non-matching lines]'
|
'(--vimgrep)--passthru[show both matching and non-matching lines]'
|
||||||
'!(--vimgrep count only replace)--passthrough'
|
'!(--vimgrep)--passthrough'
|
||||||
|
|
||||||
|
+ '(pcre2)' # PCRE2 options
|
||||||
|
{-P,--pcre2}'[enable matching with PCRE2]'
|
||||||
|
$no'(pcre2-unicode)--no-pcre2[disable matching with PCRE2]'
|
||||||
|
|
||||||
|
+ '(pcre2-unicode)' # PCRE2 Unicode options
|
||||||
|
$no'(--no-pcre2 --no-pcre2-unicode)--pcre2-unicode[enable PCRE2 Unicode mode (with -P)]'
|
||||||
|
'(--no-pcre2 --pcre2-unicode)--no-pcre2-unicode[disable PCRE2 Unicode mode (with -P)]'
|
||||||
|
|
||||||
+ '(pre)' # Preprocessing options
|
+ '(pre)' # Preprocessing options
|
||||||
'(-z --search-zip)--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
|
'(-z --search-zip)--pre=[specify preprocessor utility]:preprocessor utility:_command_names -e'
|
||||||
$no'--no-pre[disable preprocessor utility]'
|
$no'--no-pre[disable preprocessor utility]'
|
||||||
|
|
||||||
|
+ pre-glob # Preprocessing glob options
|
||||||
|
'*--pre-glob[include/exclude files for preprocessing with --pre]'
|
||||||
|
|
||||||
+ '(pretty-vimgrep)' # Pretty/vimgrep display options
|
+ '(pretty-vimgrep)' # Pretty/vimgrep display options
|
||||||
'(heading)'{-p,--pretty}'[alias for --color=always --heading -n]'
|
'(heading)'{-p,--pretty}'[alias for --color=always --heading -n]'
|
||||||
'(heading passthru)--vimgrep[show results in vim-compatible format]'
|
'(heading passthru)--vimgrep[show results in vim-compatible format]'
|
||||||
@@ -150,21 +202,39 @@ _rg() {
|
|||||||
'(1 file)*'{-e+,--regexp=}'[specify pattern]:pattern'
|
'(1 file)*'{-e+,--regexp=}'[specify pattern]:pattern'
|
||||||
|
|
||||||
+ '(replace)' # Replacement options
|
+ '(replace)' # Replacement options
|
||||||
'(count only passthru)'{-r+,--replace=}'[specify string used to replace matches]:replace string'
|
{-r+,--replace=}'[specify string used to replace matches]:replace string'
|
||||||
|
|
||||||
+ '(sort)' # File-sorting options
|
+ '(sort)' # File-sorting options
|
||||||
'(threads)--sort-files[sort results by file path (disables parallelism)]'
|
'(threads)--sort=[sort results in ascending order (disables parallelism)]:sort method:((
|
||||||
$no"--no-sort-files[don't sort results by file path]"
|
none\:"no sorting"
|
||||||
|
path\:"sort by file path"
|
||||||
|
modified\:"sort by last modified time"
|
||||||
|
accessed\:"sort by last accessed time"
|
||||||
|
created\:"sort by creation time"
|
||||||
|
))'
|
||||||
|
'(threads)--sortr=[sort results in descending order (disables parallelism)]:sort method:((
|
||||||
|
none\:"no sorting"
|
||||||
|
path\:"sort by file path"
|
||||||
|
modified\:"sort by last modified time"
|
||||||
|
accessed\:"sort by last accessed time"
|
||||||
|
created\:"sort by creation time"
|
||||||
|
))'
|
||||||
|
'!(threads)--sort-files[sort results by file path (disables parallelism)]'
|
||||||
|
|
||||||
+ stats # Statistics options
|
+ '(stats)' # Statistics options
|
||||||
'(--files file-match)--stats[show search statistics]'
|
'(--files file-match)--stats[show search statistics]'
|
||||||
|
$no"--no-stats[don't show search statistics]"
|
||||||
|
|
||||||
+ '(text)' # Binary-search options
|
+ '(text)' # Binary-search options
|
||||||
{-a,--text}'[search binary files as if they were text]'
|
{-a,--text}'[search binary files as if they were text]'
|
||||||
$no"--no-text[don't search binary files as if they were text]"
|
$no"(--null-data)--no-text[don't search binary files as if they were text]"
|
||||||
|
|
||||||
+ '(threads)' # Thread-count options
|
+ '(threads)' # Thread-count options
|
||||||
'(--sort-files)'{-j+,--threads=}'[specify approximate number of threads to use]:number of threads'
|
'(sort)'{-j+,--threads=}'[specify approximate number of threads to use]:number of threads'
|
||||||
|
|
||||||
|
+ '(trim)' # Trim options
|
||||||
|
'--trim[trim any ASCII whitespace prefix from each line]'
|
||||||
|
$no"--no-trim[don't trim ASCII whitespace prefix from each line]"
|
||||||
|
|
||||||
+ type # Type options
|
+ type # Type options
|
||||||
'*'{-t+,--type=}'[only search files matching specified type]: :_rg_types'
|
'*'{-t+,--type=}'[only search files matching specified type]: :_rg_types'
|
||||||
@@ -194,7 +264,6 @@ _rg() {
|
|||||||
'--context-separator=[specify string used to separate non-continuous context lines in output]:separator'
|
'--context-separator=[specify string used to separate non-continuous context lines in output]:separator'
|
||||||
'--debug[show debug messages]'
|
'--debug[show debug messages]'
|
||||||
'--dfa-size-limit=[specify upper size limit of generated DFA]:DFA size (bytes)'
|
'--dfa-size-limit=[specify upper size limit of generated DFA]:DFA size (bytes)'
|
||||||
'(-E --encoding)'{-E+,--encoding=}'[specify text encoding of files to search]: :_rg_encodings'
|
|
||||||
"(1 stats)--files[show each file that would be searched (but don't search)]"
|
"(1 stats)--files[show each file that would be searched (but don't search)]"
|
||||||
'*--ignore-file=[specify additional ignore file]:ignore file:_files'
|
'*--ignore-file=[specify additional ignore file]:ignore file:_files'
|
||||||
'(-v --invert-match)'{-v,--invert-match}'[invert matching]'
|
'(-v --invert-match)'{-v,--invert-match}'[invert matching]'
|
||||||
@@ -327,6 +396,157 @@ _rg_types() {
|
|||||||
|
|
||||||
_rg "$@"
|
_rg "$@"
|
||||||
|
|
||||||
|
################################################################################
|
||||||
|
# ZSH COMPLETION REFERENCE
|
||||||
|
#
|
||||||
|
# For the convenience of developers who aren't especially familiar with zsh
|
||||||
|
# completion functions, a brief reference guide follows. This is in no way
|
||||||
|
# comprehensive; it covers just enough of the basic structure, syntax, and
|
||||||
|
# conventions to help someone make simple changes like adding new options. For
|
||||||
|
# more complete documentation regarding zsh completion functions, please see the
|
||||||
|
# following:
|
||||||
|
#
|
||||||
|
# * http://zsh.sourceforge.net/Doc/Release/Completion-System.html
|
||||||
|
# * https://github.com/zsh-users/zsh/blob/master/Etc/completion-style-guide
|
||||||
|
#
|
||||||
|
# OVERVIEW
|
||||||
|
#
|
||||||
|
# Most zsh completion functions are defined in terms of `_arguments`, which is a
|
||||||
|
# shell function that takes a series of argument specifications. The specs for
|
||||||
|
# `rg` are stored in an array, which is common for more complex functions; the
|
||||||
|
# elements of the array are passed to `_arguments` on invocation.
|
||||||
|
#
|
||||||
|
# ARGUMENT-SPECIFICATION SYNTAX
|
||||||
|
#
|
||||||
|
# The following is a contrived example of the argument specs for a simple tool:
|
||||||
|
#
|
||||||
|
# '(: * -)'{-h,--help}'[display help information]'
|
||||||
|
# '(-q -v --quiet --verbose)'{-q,--quiet}'[decrease output verbosity]'
|
||||||
|
# '!(-q -v --quiet --verbose)--silent'
|
||||||
|
# '(-q -v --quiet --verbose)'{-v,--verbose}'[increase output verbosity]'
|
||||||
|
# '--color=[specify when to use colors]:when:(always never auto)'
|
||||||
|
# '*:example file:_files'
|
||||||
|
#
|
||||||
|
# Although there may appear to be six specs here, there are actually nine; we
|
||||||
|
# use brace expansion to combine specs for options that go by multiple names,
|
||||||
|
# like `-q` and `--quiet`. This is customary, and ties in with the fact that zsh
|
||||||
|
# merges completion possibilities together when they have the same description.
|
||||||
|
#
|
||||||
|
# The first line defines the option `-h`/`--help`. With most tools, it isn't
|
||||||
|
# useful to complete anything after `--help` because it effectively overrides
|
||||||
|
# all others; the `(: * -)` at the beginning of the spec tells zsh not to
|
||||||
|
# complete any other operands (`:` and `*`) or options (`-`) after this one has
|
||||||
|
# been used. The `[...]` at the end associates a description with `-h`/`--help`;
|
||||||
|
# as mentioned, zsh will see the identical descriptions and merge these options
|
||||||
|
# together when offering completion possibilities.
|
||||||
|
#
|
||||||
|
# The next line defines `-q`/`--quiet`. Here we don't want to suppress further
|
||||||
|
# completions entirely, but we don't want to offer `-q` if `--quiet` has been
|
||||||
|
# given (since they do the same thing), nor do we want to offer `-v` (since it
|
||||||
|
# doesn't make sense to be quiet and verbose at the same time). We don't need to
|
||||||
|
# tell zsh not to offer `--quiet` a second time, since that's the default
|
||||||
|
# behaviour, but since this line expands to two specs describing `-q` *and*
|
||||||
|
# `--quiet` we do need to explicitly list all of them here.
|
||||||
|
#
|
||||||
|
# The next line defines a hidden option `--silent` — maybe it's a deprecated
|
||||||
|
# synonym for `--quiet`. The leading `!` indicates that zsh shouldn't offer this
|
||||||
|
# option during completion. The benefit of providing a spec for an option that
|
||||||
|
# shouldn't be completed is that, if someone *does* use it, we can correctly
|
||||||
|
# suppress completion of other options afterwards.
|
||||||
|
#
|
||||||
|
# The next line defines `-v`/`--verbose`; this works just like `-q`/`--quiet`.
|
||||||
|
#
|
||||||
|
# The next line defines `--color`. In this example, `--color` doesn't have a
|
||||||
|
# corresponding short option, so we don't need to use brace expansion. Further,
|
||||||
|
# there are no other options it's exclusive with (just itself), so we don't need
|
||||||
|
# to define those at the beginning. However, it does take a mandatory argument.
|
||||||
|
# The `=` at the end of `--color=` indicates that the argument may appear either
|
||||||
|
# like `--color always` or like `--color=always`; this is how most GNU-style
|
||||||
|
# command-line tools work. The corresponding short option would normally use `+`
|
||||||
|
# — for example, `-c+` would allow either `-c always` or `-calways`. For this
|
||||||
|
# option, the arguments are known ahead of time, so we can simply list them in
|
||||||
|
# parentheses at the end (`when` is used as the description for the argument).
|
||||||
|
#
|
||||||
|
# The last line defines an operand (a non-option argument). In this example, the
|
||||||
|
# operand can be used any number of times (the leading `*`), and it should be a
|
||||||
|
# file path, so we tell zsh to call the `_files` function to complete it. The
|
||||||
|
# `example file` in the middle is the description to use for this operand; we
|
||||||
|
# could use a space instead to accept the default provided by `_files`.
|
||||||
|
#
|
||||||
|
# GROUPING ARGUMENT SPECIFICATIONS
|
||||||
|
#
|
||||||
|
# Newer versions of zsh support grouping argument specs together. All specs
|
||||||
|
# following a `+` and then a group name are considered to be members of the
|
||||||
|
# named group. Grouping is useful mostly for organisational purposes; it makes
|
||||||
|
# the relationship between different options more obvious, and makes it easier
|
||||||
|
# to specify exclusions.
|
||||||
|
#
|
||||||
|
# We could rewrite our example above using grouping as follows:
|
||||||
|
#
|
||||||
|
# '(: * -)'{-h,--help}'[display help information]'
|
||||||
|
# '--color=[specify when to use colors]:when:(always never auto)'
|
||||||
|
# '*:example file:_files'
|
||||||
|
# + '(verbosity)'
|
||||||
|
# {-q,--quiet}'[decrease output verbosity]'
|
||||||
|
# '!--silent'
|
||||||
|
# {-v,--verbose}'[increase output verbosity]'
|
||||||
|
#
|
||||||
|
# Here we take advantage of a useful feature of spec grouping — when the group
|
||||||
|
# name is surrounded by parentheses, as in `(verbosity)`, it tells zsh that all
|
||||||
|
# of the options in that group are exclusive with each other. As a result, we
|
||||||
|
# don't need to manually list out the exclusions at the beginning of each
|
||||||
|
# option.
|
||||||
|
#
|
||||||
|
# Groups can also be referred to by name in other argument specs; for example:
|
||||||
|
#
|
||||||
|
# '(xyz)--aaa' '*: :_files'
|
||||||
|
# + xyz --xxx --yyy --zzz
|
||||||
|
#
|
||||||
|
# Here we use the group name `xyz` to tell zsh that `--xxx`, `--yyy`, and
|
||||||
|
# `--zzz` are not to be completed after `--aaa`. This makes the exclusion list
|
||||||
|
# much more compact and reusable.
|
||||||
|
#
|
||||||
|
# CONVENTIONS
|
||||||
|
#
|
||||||
|
# zsh completion functions generally adhere to the following conventions:
|
||||||
|
#
|
||||||
|
# * Use two spaces for indentation
|
||||||
|
# * Combine specs for options with different names using brace expansion
|
||||||
|
# * In combined specs, list the short option first (as in `{-a,--text}`)
|
||||||
|
# * Use `+` or `=` as described above for options that take arguments
|
||||||
|
# * Provide a description for all options, option-arguments, and operands
|
||||||
|
# * Capitalise/punctuate argument descriptions as phrases, not complete
|
||||||
|
# sentences — 'display help information', never 'Display help information.'
|
||||||
|
# (but still capitalise acronyms and proper names)
|
||||||
|
# * Write argument descriptions as verb phrases — 'display x', 'enable y',
|
||||||
|
# 'use z'
|
||||||
|
# * Word descriptions to make it clear when an option expects an argument;
|
||||||
|
# usually this is done with the word 'specify', as in 'specify x' or
|
||||||
|
# 'use specified x')
|
||||||
|
# * Write argument descriptions as tersely as possible — for example, articles
|
||||||
|
# like 'a' and 'the' should be omitted unless it would be confusing
|
||||||
|
#
|
||||||
|
# Other conventions currently used by this function:
|
||||||
|
#
|
||||||
|
# * Order argument specs alphabetically by group name, then option name
|
||||||
|
# * Group options that are directly related, mutually exclusive, or frequently
|
||||||
|
# referenced by other argument specs
|
||||||
|
# * Use only characters in the set [a-z0-9_-] in group names
|
||||||
|
# * Order exclusion lists as follows: short options, long options, groups
|
||||||
|
# * Use American English in descriptions
|
||||||
|
# * Use 'don't' in descriptions instead of 'do not'
|
||||||
|
# * Word descriptions for related options as similarly as possible. For example,
|
||||||
|
# `--foo[enable foo]` and `--no-foo[disable foo]`, or `--foo[use foo]` and
|
||||||
|
# `--no-foo[don't use foo]`
|
||||||
|
# * Word descriptions to make it clear when an option only makes sense with
|
||||||
|
# another option, usually by adding '(with -x)' to the end
|
||||||
|
# * Don't quote strings or variables unnecessarily. When quotes are required,
|
||||||
|
# prefer single-quotes to double-quotes
|
||||||
|
# * Prefix option specs with `$no` when the option serves only to negate the
|
||||||
|
# behaviour of another option that must be provided explicitly by the user.
|
||||||
|
# This prevents rarely used options from cluttering up the completion menu
|
||||||
|
################################################################################
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
# Copyright (c) 2011 Github zsh-users - http://github.com/zsh-users
|
# Copyright (c) 2011 Github zsh-users - http://github.com/zsh-users
|
||||||
# All rights reserved.
|
# All rights reserved.
|
||||||
|
|||||||
@@ -28,27 +28,37 @@ Synopsis
|
|||||||
DESCRIPTION
|
DESCRIPTION
|
||||||
-----------
|
-----------
|
||||||
ripgrep (rg) recursively searches your current directory for a regex pattern.
|
ripgrep (rg) recursively searches your current directory for a regex pattern.
|
||||||
By default, ripgrep will respect your `.gitignore` and automatically skip
|
By default, ripgrep will respect your .gitignore and automatically skip hidden
|
||||||
hidden files/directories and binary files.
|
files/directories and binary files.
|
||||||
|
|
||||||
ripgrep's regex engine uses finite automata and guarantees linear time
|
ripgrep's default regex engine uses finite automata and guarantees linear
|
||||||
searching. Because of this, features like backreferences and arbitrary
|
time searching. Because of this, features like backreferences and arbitrary
|
||||||
lookaround are not supported.
|
look-around are not supported. However, if ripgrep is built with PCRE2, then
|
||||||
|
the *--pcre2* flag can be used to enable backreferences and look-around.
|
||||||
|
|
||||||
|
ripgrep supports configuration files. Set *RIPGREP_CONFIG_PATH* to a
|
||||||
|
configuration file. The file can specify one shell argument per line. Lines
|
||||||
|
starting with *#* are ignored. For more details, see the man page or the
|
||||||
|
*README*.
|
||||||
|
|
||||||
|
|
||||||
REGEX SYNTAX
|
REGEX SYNTAX
|
||||||
------------
|
------------
|
||||||
ripgrep uses Rust's regex engine, which documents its syntax:
|
ripgrep uses Rust's regex engine by default, which documents its syntax:
|
||||||
https://docs.rs/regex/0.2.5/regex/#syntax
|
https://docs.rs/regex/*/regex/#syntax
|
||||||
|
|
||||||
ripgrep uses byte-oriented regexes, which has some additional documentation:
|
ripgrep uses byte-oriented regexes, which has some additional documentation:
|
||||||
https://docs.rs/regex/0.2.5/regex/bytes/index.html#syntax
|
https://docs.rs/regex/*/regex/bytes/index.html#syntax
|
||||||
|
|
||||||
To a first approximation, ripgrep uses Perl-like regexes without look-around or
|
To a first approximation, ripgrep uses Perl-like regexes without look-around or
|
||||||
backreferences. This makes them very similar to the "extended" (ERE) regular
|
backreferences. This makes them very similar to the "extended" (ERE) regular
|
||||||
expressions supported by `egrep`, but with a few additional features like
|
expressions supported by *egrep*, but with a few additional features like
|
||||||
Unicode character classes.
|
Unicode character classes.
|
||||||
|
|
||||||
|
If you're using ripgrep with the *--pcre2* flag, then please consult
|
||||||
|
https://www.pcre.org or the PCRE2 man pages for documentation on the supported
|
||||||
|
syntax.
|
||||||
|
|
||||||
|
|
||||||
POSITIONAL ARGUMENTS
|
POSITIONAL ARGUMENTS
|
||||||
--------------------
|
--------------------
|
||||||
@@ -58,18 +68,37 @@ _PATTERN_::
|
|||||||
|
|
||||||
_PATH_::
|
_PATH_::
|
||||||
A file or directory to search. Directories are searched recursively. Paths
|
A file or directory to search. Directories are searched recursively. Paths
|
||||||
specified expicitly on the command line override glob and ignore rules.
|
specified explicitly on the command line override glob and ignore rules.
|
||||||
|
|
||||||
|
|
||||||
OPTIONS
|
OPTIONS
|
||||||
-------
|
-------
|
||||||
|
Note that for many options, there exist flags to disable them. In some cases,
|
||||||
|
those flags are not listed in a first class way below. For example, the
|
||||||
|
*--column* flag (listed below) enables column numbers in ripgrep's output, but
|
||||||
|
the *--no-column* flag (not listed below) disables them. The reverse can also
|
||||||
|
exist. For example, the *--no-ignore* flag (listed below) disables ripgrep's
|
||||||
|
*gitignore* logic, but the *--ignore* flag (not listed below) enables it. These
|
||||||
|
flags are useful for overriding a ripgrep configuration file on the command
|
||||||
|
line. Each flag's documentation notes whether an inverted flag exists. In all
|
||||||
|
cases, the flag specified last takes precedence.
|
||||||
|
|
||||||
{OPTIONS}
|
{OPTIONS}
|
||||||
|
|
||||||
|
|
||||||
EXIT STATUS
|
EXIT STATUS
|
||||||
-----------
|
-----------
|
||||||
If ripgrep finds a match, then the exit status of the program is 0. If no match
|
If ripgrep finds a match, then the exit status of the program is 0. If no match
|
||||||
could be found, then the exit status is non-zero.
|
could be found, then the exit status is 1. If an error occurred, then the exit
|
||||||
|
status is always 2 unless ripgrep was run with the *--quiet* flag and a match
|
||||||
|
was found. In summary:
|
||||||
|
|
||||||
|
* `0` exit status occurs only when at least one match was found, and if
|
||||||
|
no error occurred, unless *--quiet* was given.
|
||||||
|
* `1` exit status occurs only when no match was found and no error occurred.
|
||||||
|
* `2` exit status occurs when an error occurred. This is true for both
|
||||||
|
catastrophic errors (e.g., a regex syntax error) and for soft errors (e.g.,
|
||||||
|
unable to read a file).
|
||||||
|
|
||||||
|
|
||||||
CONFIGURATION FILES
|
CONFIGURATION FILES
|
||||||
@@ -79,11 +108,11 @@ behavior. The format of the configuration file is an "rc" style and is very
|
|||||||
simple. It is defined by two rules:
|
simple. It is defined by two rules:
|
||||||
|
|
||||||
1. Every line is a shell argument, after trimming ASCII whitespace.
|
1. Every line is a shell argument, after trimming ASCII whitespace.
|
||||||
2. Lines starting with _#_ (optionally preceded by any amount of
|
2. Lines starting with *#* (optionally preceded by any amount of
|
||||||
ASCII whitespace) are ignored.
|
ASCII whitespace) are ignored.
|
||||||
|
|
||||||
ripgrep will look for a single configuration file if and only if the
|
ripgrep will look for a single configuration file if and only if the
|
||||||
_RIPGREP_CONFIG_PATH_ environment variable is set and is non-empty.
|
*RIPGREP_CONFIG_PATH* environment variable is set and is non-empty.
|
||||||
ripgrep will parse shell arguments from this file on startup and will
|
ripgrep will parse shell arguments from this file on startup and will
|
||||||
behave as if the arguments in this file were prepended to any explicit
|
behave as if the arguments in this file were prepended to any explicit
|
||||||
arguments given to ripgrep on the command line.
|
arguments given to ripgrep on the command line.
|
||||||
@@ -145,20 +174,20 @@ SHELL COMPLETION
|
|||||||
Shell completion files are included in the release tarball for Bash, Fish, Zsh
|
Shell completion files are included in the release tarball for Bash, Fish, Zsh
|
||||||
and PowerShell.
|
and PowerShell.
|
||||||
|
|
||||||
For *bash*, move `rg.bash` to `$XDG_CONFIG_HOME/bash_completion`
|
For *bash*, move *rg.bash* to *$XDG_CONFIG_HOME/bash_completion*
|
||||||
or `/etc/bash_completion.d/`.
|
or */etc/bash_completion.d/*.
|
||||||
|
|
||||||
For *fish*, move `rg.fish` to `$HOME/.config/fish/completions`.
|
For *fish*, move *rg.fish* to *$HOME/.config/fish/completions*.
|
||||||
|
|
||||||
For *zsh*, move `_rg` to one of your `$fpath` directories.
|
For *zsh*, move *_rg* to one of your *$fpath* directories.
|
||||||
|
|
||||||
|
|
||||||
CAVEATS
|
CAVEATS
|
||||||
-------
|
-------
|
||||||
ripgrep may abort unexpectedly when using default settings if it searches a
|
ripgrep may abort unexpectedly when using default settings if it searches a
|
||||||
file that is simultaneously truncated. This behavior can be avoided by passing
|
file that is simultaneously truncated. This behavior can be avoided by passing
|
||||||
the --no-mmap flag which will forcefully disable the use of memory maps in all
|
the *--no-mmap* flag which will forcefully disable the use of memory maps in
|
||||||
cases.
|
all cases.
|
||||||
|
|
||||||
|
|
||||||
VERSION
|
VERSION
|
||||||
@@ -170,7 +199,11 @@ HOMEPAGE
|
|||||||
--------
|
--------
|
||||||
https://github.com/BurntSushi/ripgrep
|
https://github.com/BurntSushi/ripgrep
|
||||||
|
|
||||||
Please report bugs and feature requests in the issue tracker.
|
Please report bugs and feature requests in the issue tracker. Please do your
|
||||||
|
best to provide a reproducible test case for bugs. This should include the
|
||||||
|
corpus being searched, the *rg* command, the actual output and the expected
|
||||||
|
output. Please also include the output of running the same *rg* command but
|
||||||
|
with the *--debug* flag.
|
||||||
|
|
||||||
|
|
||||||
AUTHORS
|
AUTHORS
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "globset"
|
name = "globset"
|
||||||
version = "0.4.1" #:version
|
version = "0.4.2" #:version
|
||||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
description = """
|
description = """
|
||||||
Cross platform single glob and glob set matching. Glob set matching is the
|
Cross platform single glob and glob set matching. Glob set matching is the
|
||||||
@@ -19,14 +19,14 @@ name = "globset"
|
|||||||
bench = false
|
bench = false
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
aho-corasick = "0.6.0"
|
aho-corasick = "0.6.8"
|
||||||
fnv = "1.0"
|
fnv = "1.0.6"
|
||||||
log = "0.4"
|
log = "0.4.5"
|
||||||
memchr = "2"
|
memchr = "2.1.0"
|
||||||
regex = "1"
|
regex = "1.1.0"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
glob = "0.2"
|
glob = "0.2.11"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
simd-accel = []
|
simd-accel = []
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ Cross platform single glob and glob set matching. Glob set matching is the
|
|||||||
process of matching one or more glob patterns against a single candidate path
|
process of matching one or more glob patterns against a single candidate path
|
||||||
simultaneously, and returning all of the globs that matched.
|
simultaneously, and returning all of the globs that matched.
|
||||||
|
|
||||||
[](https://travis-ci.org/BurntSushi/ripgrep)
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
[](https://crates.io/crates/globset)
|
[](https://crates.io/crates/globset)
|
||||||
|
|
||||||
|
|||||||
@@ -837,40 +837,66 @@ impl<'a> Parser<'a> {
|
|||||||
|
|
||||||
fn parse_star(&mut self) -> Result<(), Error> {
|
fn parse_star(&mut self) -> Result<(), Error> {
|
||||||
let prev = self.prev;
|
let prev = self.prev;
|
||||||
if self.chars.peek() != Some(&'*') {
|
if self.peek() != Some('*') {
|
||||||
self.push_token(Token::ZeroOrMore)?;
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
assert!(self.bump() == Some('*'));
|
assert!(self.bump() == Some('*'));
|
||||||
if !self.have_tokens()? {
|
if !self.have_tokens()? {
|
||||||
self.push_token(Token::RecursivePrefix)?;
|
if !self.peek().map_or(true, is_separator) {
|
||||||
let next = self.bump();
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
if !next.map(is_separator).unwrap_or(true) {
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
return Err(self.error(ErrorKind::InvalidRecursive));
|
} else {
|
||||||
|
self.push_token(Token::RecursivePrefix)?;
|
||||||
|
assert!(self.bump().map_or(true, is_separator));
|
||||||
}
|
}
|
||||||
return Ok(());
|
return Ok(());
|
||||||
}
|
}
|
||||||
self.pop_token()?;
|
|
||||||
if !prev.map(is_separator).unwrap_or(false) {
|
if !prev.map(is_separator).unwrap_or(false) {
|
||||||
if self.stack.len() <= 1
|
if self.stack.len() <= 1
|
||||||
|| (prev != Some(',') && prev != Some('{')) {
|
|| (prev != Some(',') && prev != Some('{'))
|
||||||
return Err(self.error(ErrorKind::InvalidRecursive));
|
{
|
||||||
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
|
return Ok(());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
match self.chars.peek() {
|
let is_suffix =
|
||||||
None => {
|
match self.peek() {
|
||||||
assert!(self.bump().is_none());
|
None => {
|
||||||
self.push_token(Token::RecursiveSuffix)
|
assert!(self.bump().is_none());
|
||||||
|
true
|
||||||
|
}
|
||||||
|
Some(',') | Some('}') if self.stack.len() >= 2 => {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
Some(c) if is_separator(c) => {
|
||||||
|
assert!(self.bump().map(is_separator).unwrap_or(false));
|
||||||
|
false
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
|
self.push_token(Token::ZeroOrMore)?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
match self.pop_token()? {
|
||||||
|
Token::RecursivePrefix => {
|
||||||
|
self.push_token(Token::RecursivePrefix)?;
|
||||||
}
|
}
|
||||||
Some(&',') | Some(&'}') if self.stack.len() >= 2 => {
|
Token::RecursiveSuffix => {
|
||||||
self.push_token(Token::RecursiveSuffix)
|
self.push_token(Token::RecursiveSuffix)?;
|
||||||
}
|
}
|
||||||
Some(&c) if is_separator(c) => {
|
_ => {
|
||||||
assert!(self.bump().map(is_separator).unwrap_or(false));
|
if is_suffix {
|
||||||
self.push_token(Token::RecursiveZeroOrMore)
|
self.push_token(Token::RecursiveSuffix)?;
|
||||||
|
} else {
|
||||||
|
self.push_token(Token::RecursiveZeroOrMore)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
_ => Err(self.error(ErrorKind::InvalidRecursive)),
|
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_class(&mut self) -> Result<(), Error> {
|
fn parse_class(&mut self) -> Result<(), Error> {
|
||||||
@@ -959,6 +985,10 @@ impl<'a> Parser<'a> {
|
|||||||
self.cur = self.chars.next();
|
self.cur = self.chars.next();
|
||||||
self.cur
|
self.cur
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn peek(&mut self) -> Option<char> {
|
||||||
|
self.chars.peek().map(|&ch| ch)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -1144,13 +1174,6 @@ mod tests {
|
|||||||
syntax!(cls20, "[^a]", vec![classn('a', 'a')]);
|
syntax!(cls20, "[^a]", vec![classn('a', 'a')]);
|
||||||
syntax!(cls21, "[^a-z]", vec![classn('a', 'z')]);
|
syntax!(cls21, "[^a-z]", vec![classn('a', 'z')]);
|
||||||
|
|
||||||
syntaxerr!(err_rseq1, "a**", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq2, "**a", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq3, "a**b", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq4, "***", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq5, "/a**", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq6, "/**a", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_rseq7, "/a**b", ErrorKind::InvalidRecursive);
|
|
||||||
syntaxerr!(err_unclosed1, "[", ErrorKind::UnclosedClass);
|
syntaxerr!(err_unclosed1, "[", ErrorKind::UnclosedClass);
|
||||||
syntaxerr!(err_unclosed2, "[]", ErrorKind::UnclosedClass);
|
syntaxerr!(err_unclosed2, "[]", ErrorKind::UnclosedClass);
|
||||||
syntaxerr!(err_unclosed3, "[!", ErrorKind::UnclosedClass);
|
syntaxerr!(err_unclosed3, "[!", ErrorKind::UnclosedClass);
|
||||||
@@ -1194,8 +1217,30 @@ mod tests {
|
|||||||
toregex!(re8, "[*]", r"^[\*]$");
|
toregex!(re8, "[*]", r"^[\*]$");
|
||||||
toregex!(re9, "[+]", r"^[\+]$");
|
toregex!(re9, "[+]", r"^[\+]$");
|
||||||
toregex!(re10, "+", r"^\+$");
|
toregex!(re10, "+", r"^\+$");
|
||||||
toregex!(re11, "**", r"^.*$");
|
toregex!(re11, "☃", r"^\xe2\x98\x83$");
|
||||||
toregex!(re12, "☃", r"^\xe2\x98\x83$");
|
toregex!(re12, "**", r"^.*$");
|
||||||
|
toregex!(re13, "**/", r"^.*$");
|
||||||
|
toregex!(re14, "**/*", r"^(?:/?|.*/).*$");
|
||||||
|
toregex!(re15, "**/**", r"^.*$");
|
||||||
|
toregex!(re16, "**/**/*", r"^(?:/?|.*/).*$");
|
||||||
|
toregex!(re17, "**/**/**", r"^.*$");
|
||||||
|
toregex!(re18, "**/**/**/*", r"^(?:/?|.*/).*$");
|
||||||
|
toregex!(re19, "a/**", r"^a(?:/?|/.*)$");
|
||||||
|
toregex!(re20, "a/**/**", r"^a(?:/?|/.*)$");
|
||||||
|
toregex!(re21, "a/**/**/**", r"^a(?:/?|/.*)$");
|
||||||
|
toregex!(re22, "a/**/b", r"^a(?:/|/.*/)b$");
|
||||||
|
toregex!(re23, "a/**/**/b", r"^a(?:/|/.*/)b$");
|
||||||
|
toregex!(re24, "a/**/**/**/b", r"^a(?:/|/.*/)b$");
|
||||||
|
toregex!(re25, "**/b", r"^(?:/?|.*/)b$");
|
||||||
|
toregex!(re26, "**/**/b", r"^(?:/?|.*/)b$");
|
||||||
|
toregex!(re27, "**/**/**/b", r"^(?:/?|.*/)b$");
|
||||||
|
toregex!(re28, "a**", r"^a.*.*$");
|
||||||
|
toregex!(re29, "**a", r"^.*.*a$");
|
||||||
|
toregex!(re30, "a**b", r"^a.*.*b$");
|
||||||
|
toregex!(re31, "***", r"^.*.*.*$");
|
||||||
|
toregex!(re32, "/a**", r"^/a.*.*$");
|
||||||
|
toregex!(re33, "/**a", r"^/.*.*a$");
|
||||||
|
toregex!(re34, "/a**b", r"^/a.*.*b$");
|
||||||
|
|
||||||
matches!(match1, "a", "a");
|
matches!(match1, "a", "a");
|
||||||
matches!(match2, "a*b", "a_b");
|
matches!(match2, "a*b", "a_b");
|
||||||
|
|||||||
@@ -143,8 +143,13 @@ pub struct Error {
|
|||||||
/// The kind of error that can occur when parsing a glob pattern.
|
/// The kind of error that can occur when parsing a glob pattern.
|
||||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||||
pub enum ErrorKind {
|
pub enum ErrorKind {
|
||||||
/// Occurs when a use of `**` is invalid. Namely, `**` can only appear
|
/// **DEPRECATED**.
|
||||||
/// adjacent to a path separator, or the beginning/end of a glob.
|
///
|
||||||
|
/// This error used to occur for consistency with git's glob specification,
|
||||||
|
/// but the specification now accepts all uses of `**`. When `**` does not
|
||||||
|
/// appear adjacent to a path separator or at the beginning/end of a glob,
|
||||||
|
/// it is now treated as two consecutive `*` patterns. As such, this error
|
||||||
|
/// is no longer used.
|
||||||
InvalidRecursive,
|
InvalidRecursive,
|
||||||
/// Occurs when a character class (e.g., `[abc]`) is not closed.
|
/// Occurs when a character class (e.g., `[abc]`) is not closed.
|
||||||
UnclosedClass,
|
UnclosedClass,
|
||||||
@@ -470,7 +475,6 @@ impl GlobSetBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Add a new pattern to this set.
|
/// Add a new pattern to this set.
|
||||||
#[allow(dead_code)]
|
|
||||||
pub fn add(&mut self, pat: Glob) -> &mut GlobSetBuilder {
|
pub fn add(&mut self, pat: Glob) -> &mut GlobSetBuilder {
|
||||||
self.pats.push(pat);
|
self.pats.push(pat);
|
||||||
self
|
self
|
||||||
|
|||||||
25
grep-cli/Cargo.toml
Normal file
25
grep-cli/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-cli"
|
||||||
|
version = "0.1.1" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
Utilities for search oriented command line applications.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-cli"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["regex", "grep", "cli", "utility", "util"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
atty = "0.2.11"
|
||||||
|
globset = { version = "0.4.2", path = "../globset" }
|
||||||
|
lazy_static = "1.1.0"
|
||||||
|
log = "0.4.5"
|
||||||
|
regex = "1.1"
|
||||||
|
same-file = "1.0.4"
|
||||||
|
termcolor = "1.0.4"
|
||||||
|
|
||||||
|
[target.'cfg(windows)'.dependencies.winapi-util]
|
||||||
|
version = "0.1.1"
|
||||||
21
grep-cli/LICENSE-MIT
Normal file
21
grep-cli/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
38
grep-cli/README.md
Normal file
38
grep-cli/README.md
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
grep-cli
|
||||||
|
--------
|
||||||
|
A utility library that provides common routines desired in search oriented
|
||||||
|
command line applications. This includes, but is not limited to, parsing hex
|
||||||
|
escapes, detecting whether stdin is readable and more. To the extent possible,
|
||||||
|
this crate strives for compatibility across Windows, macOS and Linux.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-cli)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-cli](https://docs.rs/grep-cli)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-cli = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_cli;
|
||||||
|
```
|
||||||
24
grep-cli/UNLICENSE
Normal file
24
grep-cli/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
382
grep-cli/src/decompress.rs
Normal file
382
grep-cli/src/decompress.rs
Normal file
@@ -0,0 +1,382 @@
|
|||||||
|
use std::ffi::{OsStr, OsString};
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
use globset::{Glob, GlobSet, GlobSetBuilder};
|
||||||
|
|
||||||
|
use process::{CommandError, CommandReader, CommandReaderBuilder};
|
||||||
|
|
||||||
|
/// A builder for a matcher that determines which files get decompressed.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct DecompressionMatcherBuilder {
|
||||||
|
/// The commands for each matching glob.
|
||||||
|
commands: Vec<DecompressionCommand>,
|
||||||
|
/// Whether to include the default matching rules.
|
||||||
|
defaults: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A representation of a single command for decompressing data
|
||||||
|
/// out-of-proccess.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
struct DecompressionCommand {
|
||||||
|
/// The glob that matches this command.
|
||||||
|
glob: String,
|
||||||
|
/// The command or binary name.
|
||||||
|
bin: OsString,
|
||||||
|
/// The arguments to invoke with the command.
|
||||||
|
args: Vec<OsString>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DecompressionMatcherBuilder {
|
||||||
|
fn default() -> DecompressionMatcherBuilder {
|
||||||
|
DecompressionMatcherBuilder::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DecompressionMatcherBuilder {
|
||||||
|
/// Create a new builder for configuring a decompression matcher.
|
||||||
|
pub fn new() -> DecompressionMatcherBuilder {
|
||||||
|
DecompressionMatcherBuilder {
|
||||||
|
commands: vec![],
|
||||||
|
defaults: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a matcher for determining how to decompress files.
|
||||||
|
///
|
||||||
|
/// If there was a problem compiling the matcher, then an error is
|
||||||
|
/// returned.
|
||||||
|
pub fn build(&self) -> Result<DecompressionMatcher, CommandError> {
|
||||||
|
let defaults =
|
||||||
|
if !self.defaults {
|
||||||
|
vec![]
|
||||||
|
} else {
|
||||||
|
default_decompression_commands()
|
||||||
|
};
|
||||||
|
let mut glob_builder = GlobSetBuilder::new();
|
||||||
|
let mut commands = vec![];
|
||||||
|
for decomp_cmd in defaults.iter().chain(&self.commands) {
|
||||||
|
let glob = Glob::new(&decomp_cmd.glob).map_err(|err| {
|
||||||
|
CommandError::io(io::Error::new(io::ErrorKind::Other, err))
|
||||||
|
})?;
|
||||||
|
glob_builder.add(glob);
|
||||||
|
commands.push(decomp_cmd.clone());
|
||||||
|
}
|
||||||
|
let globs = glob_builder.build().map_err(|err| {
|
||||||
|
CommandError::io(io::Error::new(io::ErrorKind::Other, err))
|
||||||
|
})?;
|
||||||
|
Ok(DecompressionMatcher { globs, commands })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When enabled, the default matching rules will be compiled into this
|
||||||
|
/// matcher before any other associations. When disabled, only the
|
||||||
|
/// rules explicitly given to this builder will be used.
|
||||||
|
///
|
||||||
|
/// This is enabled by default.
|
||||||
|
pub fn defaults(&mut self, yes: bool) -> &mut DecompressionMatcherBuilder {
|
||||||
|
self.defaults = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Associates a glob with a command to decompress files matching the glob.
|
||||||
|
///
|
||||||
|
/// If multiple globs match the same file, then the most recently added
|
||||||
|
/// glob takes precedence.
|
||||||
|
///
|
||||||
|
/// The syntax for the glob is documented in the
|
||||||
|
/// [`globset` crate](https://docs.rs/globset/#syntax).
|
||||||
|
pub fn associate<P, I, A>(
|
||||||
|
&mut self,
|
||||||
|
glob: &str,
|
||||||
|
program: P,
|
||||||
|
args: I,
|
||||||
|
) -> &mut DecompressionMatcherBuilder
|
||||||
|
where P: AsRef<OsStr>,
|
||||||
|
I: IntoIterator<Item=A>,
|
||||||
|
A: AsRef<OsStr>,
|
||||||
|
{
|
||||||
|
|
||||||
|
let glob = glob.to_string();
|
||||||
|
let bin = program.as_ref().to_os_string();
|
||||||
|
let args = args
|
||||||
|
.into_iter()
|
||||||
|
.map(|a| a.as_ref().to_os_string())
|
||||||
|
.collect();
|
||||||
|
self.commands.push(DecompressionCommand { glob, bin, args });
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A matcher for determining how to decompress files.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct DecompressionMatcher {
|
||||||
|
/// The set of globs to match. Each glob has a corresponding entry in
|
||||||
|
/// `commands`. When a glob matches, the corresponding command should be
|
||||||
|
/// used to perform out-of-process decompression.
|
||||||
|
globs: GlobSet,
|
||||||
|
/// The commands for each matching glob.
|
||||||
|
commands: Vec<DecompressionCommand>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DecompressionMatcher {
|
||||||
|
fn default() -> DecompressionMatcher {
|
||||||
|
DecompressionMatcher::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DecompressionMatcher {
|
||||||
|
/// Create a new matcher with default rules.
|
||||||
|
///
|
||||||
|
/// To add more matching rules, build a matcher with
|
||||||
|
/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html).
|
||||||
|
pub fn new() -> DecompressionMatcher {
|
||||||
|
DecompressionMatcherBuilder::new()
|
||||||
|
.build()
|
||||||
|
.expect("built-in matching rules should always compile")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a pre-built command based on the given file path that can
|
||||||
|
/// decompress its contents. If no such decompressor is known, then this
|
||||||
|
/// returns `None`.
|
||||||
|
///
|
||||||
|
/// If there are multiple possible commands matching the given path, then
|
||||||
|
/// the command added last takes precedence.
|
||||||
|
pub fn command<P: AsRef<Path>>(&self, path: P) -> Option<Command> {
|
||||||
|
for i in self.globs.matches(path).into_iter().rev() {
|
||||||
|
let decomp_cmd = &self.commands[i];
|
||||||
|
let mut cmd = Command::new(&decomp_cmd.bin);
|
||||||
|
cmd.args(&decomp_cmd.args);
|
||||||
|
return Some(cmd);
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the given file path has at least one
|
||||||
|
/// matching command to perform decompression on.
|
||||||
|
pub fn has_command<P: AsRef<Path>>(&self, path: P) -> bool {
|
||||||
|
self.globs.is_match(path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configures and builds a streaming reader for decompressing data.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct DecompressionReaderBuilder {
|
||||||
|
matcher: DecompressionMatcher,
|
||||||
|
command_builder: CommandReaderBuilder,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DecompressionReaderBuilder {
|
||||||
|
/// Create a new builder with the default configuration.
|
||||||
|
pub fn new() -> DecompressionReaderBuilder {
|
||||||
|
DecompressionReaderBuilder::default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a new streaming reader for decompressing data.
|
||||||
|
///
|
||||||
|
/// If decompression is done out-of-process and if there was a problem
|
||||||
|
/// spawning the process, then its error is logged at the debug level and a
|
||||||
|
/// passthru reader is returned that does no decompression. This behavior
|
||||||
|
/// typically occurs when the given file path matches a decompression
|
||||||
|
/// command, but is executing in an environment where the decompression
|
||||||
|
/// command is not available.
|
||||||
|
///
|
||||||
|
/// If the given file path could not be matched with a decompression
|
||||||
|
/// strategy, then a passthru reader is returned that does no
|
||||||
|
/// decompression.
|
||||||
|
pub fn build<P: AsRef<Path>>(
|
||||||
|
&self,
|
||||||
|
path: P,
|
||||||
|
) -> Result<DecompressionReader, CommandError> {
|
||||||
|
let path = path.as_ref();
|
||||||
|
let mut cmd = match self.matcher.command(path) {
|
||||||
|
None => return DecompressionReader::new_passthru(path),
|
||||||
|
Some(cmd) => cmd,
|
||||||
|
};
|
||||||
|
cmd.arg(path);
|
||||||
|
|
||||||
|
match self.command_builder.build(&mut cmd) {
|
||||||
|
Ok(cmd_reader) => Ok(DecompressionReader { rdr: Ok(cmd_reader) }),
|
||||||
|
Err(err) => {
|
||||||
|
debug!(
|
||||||
|
"{}: error spawning command '{:?}': {} \
|
||||||
|
(falling back to uncompressed reader)",
|
||||||
|
path.display(),
|
||||||
|
cmd,
|
||||||
|
err,
|
||||||
|
);
|
||||||
|
DecompressionReader::new_passthru(path)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the matcher to use to look up the decompression command for each
|
||||||
|
/// file path.
|
||||||
|
///
|
||||||
|
/// A set of sensible rules is enabled by default. Setting this will
|
||||||
|
/// completely replace the current rules.
|
||||||
|
pub fn matcher(
|
||||||
|
&mut self,
|
||||||
|
matcher: DecompressionMatcher,
|
||||||
|
) -> &mut DecompressionReaderBuilder {
|
||||||
|
self.matcher = matcher;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the underlying matcher currently used by this builder.
|
||||||
|
pub fn get_matcher(&self) -> &DecompressionMatcher {
|
||||||
|
&self.matcher
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When enabled, the reader will asynchronously read the contents of the
|
||||||
|
/// command's stderr output. When disabled, stderr is only read after the
|
||||||
|
/// stdout stream has been exhausted (or if the process quits with an error
|
||||||
|
/// code).
|
||||||
|
///
|
||||||
|
/// Note that when enabled, this may require launching an additional
|
||||||
|
/// thread in order to read stderr. This is done so that the process being
|
||||||
|
/// executed is never blocked from writing to stdout or stderr. If this is
|
||||||
|
/// disabled, then it is possible for the process to fill up the stderr
|
||||||
|
/// buffer and deadlock.
|
||||||
|
///
|
||||||
|
/// This is enabled by default.
|
||||||
|
pub fn async_stderr(
|
||||||
|
&mut self,
|
||||||
|
yes: bool,
|
||||||
|
) -> &mut DecompressionReaderBuilder {
|
||||||
|
self.command_builder.async_stderr(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A streaming reader for decompressing the contents of a file.
|
||||||
|
///
|
||||||
|
/// The purpose of this reader is to provide a seamless way to decompress the
|
||||||
|
/// contents of file using existing tools in the current environment. This is
|
||||||
|
/// meant to be an alternative to using decompression libraries in favor of the
|
||||||
|
/// simplicity and portability of using external commands such as `gzip` and
|
||||||
|
/// `xz`. This does impose the overhead of spawning a process, so other means
|
||||||
|
/// for performing decompression should be sought if this overhead isn't
|
||||||
|
/// acceptable.
|
||||||
|
///
|
||||||
|
/// A decompression reader comes with a default set of matching rules that are
|
||||||
|
/// meant to associate file paths with the corresponding command to use to
|
||||||
|
/// decompress them. For example, a glob like `*.gz` matches gzip compressed
|
||||||
|
/// files with the command `gzip -d -c`. If a file path does not match any
|
||||||
|
/// existing rules, or if it matches a rule whose command does not exist in the
|
||||||
|
/// current environment, then the decompression reader passes through the
|
||||||
|
/// contents of the underlying file without doing any decompression.
|
||||||
|
///
|
||||||
|
/// The default matching rules are probably good enough for most cases, and if
|
||||||
|
/// they require revision, pull requests are welcome. In cases where they must
|
||||||
|
/// be changed or extended, they can be customized through the use of
|
||||||
|
/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html)
|
||||||
|
/// and
|
||||||
|
/// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html).
|
||||||
|
///
|
||||||
|
/// By default, this reader will asynchronously read the processes' stderr.
|
||||||
|
/// This prevents subtle deadlocking bugs for noisy processes that write a lot
|
||||||
|
/// to stderr. Currently, the entire contents of stderr is read on to the heap.
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// This example shows how to read the decompressed contents of a file without
|
||||||
|
/// needing to explicitly choose the decompression command to run.
|
||||||
|
///
|
||||||
|
/// Note that if you need to decompress multiple files, it is better to use
|
||||||
|
/// `DecompressionReaderBuilder`, which will amortize the cost of compiling the
|
||||||
|
/// matcher.
|
||||||
|
///
|
||||||
|
/// ```no_run
|
||||||
|
/// use std::io::Read;
|
||||||
|
/// use std::process::Command;
|
||||||
|
/// use grep_cli::DecompressionReader;
|
||||||
|
///
|
||||||
|
/// # fn example() -> Result<(), Box<::std::error::Error>> {
|
||||||
|
/// let mut rdr = DecompressionReader::new("/usr/share/man/man1/ls.1.gz")?;
|
||||||
|
/// let mut contents = vec![];
|
||||||
|
/// rdr.read_to_end(&mut contents)?;
|
||||||
|
/// # Ok(()) }
|
||||||
|
/// ```
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct DecompressionReader {
|
||||||
|
rdr: Result<CommandReader, File>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DecompressionReader {
|
||||||
|
/// Build a new streaming reader for decompressing data.
|
||||||
|
///
|
||||||
|
/// If decompression is done out-of-process and if there was a problem
|
||||||
|
/// spawning the process, then its error is returned.
|
||||||
|
///
|
||||||
|
/// If the given file path could not be matched with a decompression
|
||||||
|
/// strategy, then a passthru reader is returned that does no
|
||||||
|
/// decompression.
|
||||||
|
///
|
||||||
|
/// This uses the default matching rules for determining how to decompress
|
||||||
|
/// the given file. To change those matching rules, use
|
||||||
|
/// [`DecompressionReaderBuilder`](struct.DecompressionReaderBuilder.html)
|
||||||
|
/// and
|
||||||
|
/// [`DecompressionMatcherBuilder`](struct.DecompressionMatcherBuilder.html).
|
||||||
|
///
|
||||||
|
/// When creating readers for many paths. it is better to use the builder
|
||||||
|
/// since it will amortize the cost of constructing the matcher.
|
||||||
|
pub fn new<P: AsRef<Path>>(
|
||||||
|
path: P,
|
||||||
|
) -> Result<DecompressionReader, CommandError> {
|
||||||
|
DecompressionReaderBuilder::new().build(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a new "passthru" decompression reader that reads from the file
|
||||||
|
/// corresponding to the given path without doing decompression and without
|
||||||
|
/// executing another process.
|
||||||
|
fn new_passthru(path: &Path) -> Result<DecompressionReader, CommandError> {
|
||||||
|
let file = File::open(path)?;
|
||||||
|
Ok(DecompressionReader { rdr: Err(file) })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl io::Read for DecompressionReader {
|
||||||
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||||
|
match self.rdr {
|
||||||
|
Ok(ref mut rdr) => rdr.read(buf),
|
||||||
|
Err(ref mut rdr) => rdr.read(buf),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_decompression_commands() -> Vec<DecompressionCommand> {
|
||||||
|
const ARGS_GZIP: &[&str] = &["gzip", "-d", "-c"];
|
||||||
|
const ARGS_BZIP: &[&str] = &["bzip2", "-d", "-c"];
|
||||||
|
const ARGS_XZ: &[&str] = &["xz", "-d", "-c"];
|
||||||
|
const ARGS_LZ4: &[&str] = &["lz4", "-d", "-c"];
|
||||||
|
const ARGS_LZMA: &[&str] = &["xz", "--format=lzma", "-d", "-c"];
|
||||||
|
const ARGS_BROTLI: &[&str] = &["brotli", "-d", "-c"];
|
||||||
|
const ARGS_ZSTD: &[&str] = &["zstd", "-q", "-d", "-c"];
|
||||||
|
|
||||||
|
fn cmd(glob: &str, args: &[&str]) -> DecompressionCommand {
|
||||||
|
DecompressionCommand {
|
||||||
|
glob: glob.to_string(),
|
||||||
|
bin: OsStr::new(&args[0]).to_os_string(),
|
||||||
|
args: args
|
||||||
|
.iter()
|
||||||
|
.skip(1)
|
||||||
|
.map(|s| OsStr::new(s).to_os_string())
|
||||||
|
.collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vec![
|
||||||
|
cmd("*.gz", ARGS_GZIP),
|
||||||
|
cmd("*.tgz", ARGS_GZIP),
|
||||||
|
cmd("*.bz2", ARGS_BZIP),
|
||||||
|
cmd("*.tbz2", ARGS_BZIP),
|
||||||
|
cmd("*.xz", ARGS_XZ),
|
||||||
|
cmd("*.txz", ARGS_XZ),
|
||||||
|
cmd("*.lz4", ARGS_LZ4),
|
||||||
|
cmd("*.lzma", ARGS_LZMA),
|
||||||
|
cmd("*.br", ARGS_BROTLI),
|
||||||
|
cmd("*.zst", ARGS_ZSTD),
|
||||||
|
cmd("*.zstd", ARGS_ZSTD),
|
||||||
|
]
|
||||||
|
}
|
||||||
315
grep-cli/src/escape.rs
Normal file
315
grep-cli/src/escape.rs
Normal file
@@ -0,0 +1,315 @@
|
|||||||
|
use std::ffi::OsStr;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
/// A single state in the state machine used by `unescape`.
|
||||||
|
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||||
|
enum State {
|
||||||
|
/// The state after seeing a `\`.
|
||||||
|
Escape,
|
||||||
|
/// The state after seeing a `\x`.
|
||||||
|
HexFirst,
|
||||||
|
/// The state after seeing a `\x[0-9A-Fa-f]`.
|
||||||
|
HexSecond(char),
|
||||||
|
/// Default state.
|
||||||
|
Literal,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Escapes arbitrary bytes into a human readable string.
|
||||||
|
///
|
||||||
|
/// This converts `\t`, `\r` and `\n` into their escaped forms. It also
|
||||||
|
/// converts the non-printable subset of ASCII in addition to invalid UTF-8
|
||||||
|
/// bytes to hexadecimal escape sequences. Everything else is left as is.
|
||||||
|
///
|
||||||
|
/// The dual of this routine is [`unescape`](fn.unescape.html).
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// This example shows how to convert a byte string that contains a `\n` and
|
||||||
|
/// invalid UTF-8 bytes into a `String`.
|
||||||
|
///
|
||||||
|
/// Pay special attention to the use of raw strings. That is, `r"\n"` is
|
||||||
|
/// equivalent to `"\\n"`.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use grep_cli::escape;
|
||||||
|
///
|
||||||
|
/// assert_eq!(r"foo\nbar\xFFbaz", escape(b"foo\nbar\xFFbaz"));
|
||||||
|
/// ```
|
||||||
|
pub fn escape(mut bytes: &[u8]) -> String {
|
||||||
|
let mut escaped = String::new();
|
||||||
|
while let Some(result) = decode_utf8(bytes) {
|
||||||
|
match result {
|
||||||
|
Ok(cp) => {
|
||||||
|
escape_char(cp, &mut escaped);
|
||||||
|
bytes = &bytes[cp.len_utf8()..];
|
||||||
|
}
|
||||||
|
Err(byte) => {
|
||||||
|
escape_byte(byte, &mut escaped);
|
||||||
|
bytes = &bytes[1..];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
escaped
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Escapes an OS string into a human readable string.
|
||||||
|
///
|
||||||
|
/// This is like [`escape`](fn.escape.html), but accepts an OS string.
|
||||||
|
pub fn escape_os(string: &OsStr) -> String {
|
||||||
|
#[cfg(unix)]
|
||||||
|
fn imp(string: &OsStr) -> String {
|
||||||
|
use std::os::unix::ffi::OsStrExt;
|
||||||
|
|
||||||
|
escape(string.as_bytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(unix))]
|
||||||
|
fn imp(string: &OsStr) -> String {
|
||||||
|
escape(string.to_string_lossy().as_bytes())
|
||||||
|
}
|
||||||
|
|
||||||
|
imp(string)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unescapes a string.
|
||||||
|
///
|
||||||
|
/// It supports a limited set of escape sequences:
|
||||||
|
///
|
||||||
|
/// * `\t`, `\r` and `\n` are mapped to their corresponding ASCII bytes.
|
||||||
|
/// * `\xZZ` hexadecimal escapes are mapped to their byte.
|
||||||
|
///
|
||||||
|
/// Everything else is left as is, including non-hexadecimal escapes like
|
||||||
|
/// `\xGG`.
|
||||||
|
///
|
||||||
|
/// This is useful when it is desirable for a command line argument to be
|
||||||
|
/// capable of specifying arbitrary bytes or otherwise make it easier to
|
||||||
|
/// specify non-printable characters.
|
||||||
|
///
|
||||||
|
/// The dual of this routine is [`escape`](fn.escape.html).
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// This example shows how to convert an escaped string (which is valid UTF-8)
|
||||||
|
/// into a corresponding sequence of bytes. Each escape sequence is mapped to
|
||||||
|
/// its bytes, which may include invalid UTF-8.
|
||||||
|
///
|
||||||
|
/// Pay special attention to the use of raw strings. That is, `r"\n"` is
|
||||||
|
/// equivalent to `"\\n"`.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use grep_cli::unescape;
|
||||||
|
///
|
||||||
|
/// assert_eq!(&b"foo\nbar\xFFbaz"[..], &*unescape(r"foo\nbar\xFFbaz"));
|
||||||
|
/// ```
|
||||||
|
pub fn unescape(s: &str) -> Vec<u8> {
|
||||||
|
use self::State::*;
|
||||||
|
|
||||||
|
let mut bytes = vec![];
|
||||||
|
let mut state = Literal;
|
||||||
|
for c in s.chars() {
|
||||||
|
match state {
|
||||||
|
Escape => {
|
||||||
|
match c {
|
||||||
|
'\\' => { bytes.push(b'\\'); state = Literal; }
|
||||||
|
'n' => { bytes.push(b'\n'); state = Literal; }
|
||||||
|
'r' => { bytes.push(b'\r'); state = Literal; }
|
||||||
|
't' => { bytes.push(b'\t'); state = Literal; }
|
||||||
|
'x' => { state = HexFirst; }
|
||||||
|
c => {
|
||||||
|
bytes.extend(format!(r"\{}", c).into_bytes());
|
||||||
|
state = Literal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HexFirst => {
|
||||||
|
match c {
|
||||||
|
'0'...'9' | 'A'...'F' | 'a'...'f' => {
|
||||||
|
state = HexSecond(c);
|
||||||
|
}
|
||||||
|
c => {
|
||||||
|
bytes.extend(format!(r"\x{}", c).into_bytes());
|
||||||
|
state = Literal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HexSecond(first) => {
|
||||||
|
match c {
|
||||||
|
'0'...'9' | 'A'...'F' | 'a'...'f' => {
|
||||||
|
let ordinal = format!("{}{}", first, c);
|
||||||
|
let byte = u8::from_str_radix(&ordinal, 16).unwrap();
|
||||||
|
bytes.push(byte);
|
||||||
|
state = Literal;
|
||||||
|
}
|
||||||
|
c => {
|
||||||
|
let original = format!(r"\x{}{}", first, c);
|
||||||
|
bytes.extend(original.into_bytes());
|
||||||
|
state = Literal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Literal => {
|
||||||
|
match c {
|
||||||
|
'\\' => { state = Escape; }
|
||||||
|
c => { bytes.extend(c.to_string().as_bytes()); }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
match state {
|
||||||
|
Escape => bytes.push(b'\\'),
|
||||||
|
HexFirst => bytes.extend(b"\\x"),
|
||||||
|
HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()),
|
||||||
|
Literal => {}
|
||||||
|
}
|
||||||
|
bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unescapes an OS string.
|
||||||
|
///
|
||||||
|
/// This is like [`unescape`](fn.unescape.html), but accepts an OS string.
|
||||||
|
///
|
||||||
|
/// Note that this first lossily decodes the given OS string as UTF-8. That
|
||||||
|
/// is, an escaped string (the thing given) should be valid UTF-8.
|
||||||
|
pub fn unescape_os(string: &OsStr) -> Vec<u8> {
|
||||||
|
unescape(&string.to_string_lossy())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds the given codepoint to the given string, escaping it if necessary.
|
||||||
|
fn escape_char(cp: char, into: &mut String) {
|
||||||
|
if cp.is_ascii() {
|
||||||
|
escape_byte(cp as u8, into);
|
||||||
|
} else {
|
||||||
|
into.push(cp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adds the given byte to the given string, escaping it if necessary.
|
||||||
|
fn escape_byte(byte: u8, into: &mut String) {
|
||||||
|
match byte {
|
||||||
|
0x21...0x5B | 0x5D...0x7D => into.push(byte as char),
|
||||||
|
b'\n' => into.push_str(r"\n"),
|
||||||
|
b'\r' => into.push_str(r"\r"),
|
||||||
|
b'\t' => into.push_str(r"\t"),
|
||||||
|
b'\\' => into.push_str(r"\\"),
|
||||||
|
_ => into.push_str(&format!(r"\x{:02X}", byte)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
|
||||||
|
///
|
||||||
|
/// If no valid encoding of a codepoint exists at the beginning of the given
|
||||||
|
/// byte slice, then the first byte is returned instead.
|
||||||
|
///
|
||||||
|
/// This returns `None` if and only if `bytes` is empty.
|
||||||
|
fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
|
||||||
|
if bytes.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let len = match utf8_len(bytes[0]) {
|
||||||
|
None => return Some(Err(bytes[0])),
|
||||||
|
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
|
||||||
|
Some(len) => len,
|
||||||
|
};
|
||||||
|
match str::from_utf8(&bytes[..len]) {
|
||||||
|
Ok(s) => Some(Ok(s.chars().next().unwrap())),
|
||||||
|
Err(_) => Some(Err(bytes[0])),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Given a UTF-8 leading byte, this returns the total number of code units
|
||||||
|
/// in the following encoded codepoint.
|
||||||
|
///
|
||||||
|
/// If the given byte is not a valid UTF-8 leading byte, then this returns
|
||||||
|
/// `None`.
|
||||||
|
fn utf8_len(byte: u8) -> Option<usize> {
|
||||||
|
if byte <= 0x7F {
|
||||||
|
Some(1)
|
||||||
|
} else if byte <= 0b110_11111 {
|
||||||
|
Some(2)
|
||||||
|
} else if byte <= 0b1110_1111 {
|
||||||
|
Some(3)
|
||||||
|
} else if byte <= 0b1111_0111 {
|
||||||
|
Some(4)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::{escape, unescape};

    // Convenience: copy a static byte string into an owned `Vec<u8>` so it
    // can be compared against `unescape`'s return value.
    fn b(bytes: &'static [u8]) -> Vec<u8> {
        bytes.to_vec()
    }

    // Round-trips of the empty string.
    #[test]
    fn empty() {
        assert_eq!(b(b""), unescape(r""));
        assert_eq!(r"", escape(b""));
    }

    // A lone backslash escapes to `\\` and back.
    #[test]
    fn backslash() {
        assert_eq!(b(b"\\"), unescape(r"\\"));
        assert_eq!(r"\\", escape(b"\\"));
    }

    // The NUL byte uses a hex escape.
    #[test]
    fn nul() {
        assert_eq!(b(b"\x00"), unescape(r"\x00"));
        assert_eq!(r"\x00", escape(b"\x00"));
    }

    // Common control characters get mnemonic escapes rather than hex.
    #[test]
    fn nl() {
        assert_eq!(b(b"\n"), unescape(r"\n"));
        assert_eq!(r"\n", escape(b"\n"));
    }

    #[test]
    fn tab() {
        assert_eq!(b(b"\t"), unescape(r"\t"));
        assert_eq!(r"\t", escape(b"\t"));
    }

    #[test]
    fn carriage() {
        assert_eq!(b(b"\r"), unescape(r"\r"));
        assert_eq!(r"\r", escape(b"\r"));
    }

    // Unrecognized escape sequences pass through literally (with the
    // backslash preserved).
    #[test]
    fn nothing_simple() {
        assert_eq!(b(b"\\a"), unescape(r"\a"));
        assert_eq!(b(b"\\a"), unescape(r"\\a"));
        assert_eq!(r"\\a", escape(b"\\a"));
    }

    // Incomplete hex escapes are also treated literally.
    #[test]
    fn nothing_hex0() {
        assert_eq!(b(b"\\x"), unescape(r"\x"));
        assert_eq!(b(b"\\x"), unescape(r"\\x"));
        assert_eq!(r"\\x", escape(b"\\x"));
    }

    #[test]
    fn nothing_hex1() {
        assert_eq!(b(b"\\xz"), unescape(r"\xz"));
        assert_eq!(b(b"\\xz"), unescape(r"\\xz"));
        assert_eq!(r"\\xz", escape(b"\\xz"));
    }

    #[test]
    fn nothing_hex2() {
        assert_eq!(b(b"\\xzz"), unescape(r"\xzz"));
        assert_eq!(b(b"\\xzz"), unescape(r"\\xzz"));
        assert_eq!(r"\\xzz", escape(b"\\xzz"));
    }

    // Bytes that aren't valid UTF-8 are rendered as hex escapes.
    #[test]
    fn invalid_utf8() {
        assert_eq!(r"\xFF", escape(b"\xFF"));
        assert_eq!(r"a\xFFb", escape(b"a\xFFb"));
    }
}
|
||||||
171
grep-cli/src/human.rs
Normal file
171
grep-cli/src/human.rs
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::fmt;
|
||||||
|
use std::io;
|
||||||
|
use std::num::ParseIntError;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
|
|
||||||
|
/// An error that occurs when parsing a human readable size description.
///
/// This error provides an end user friendly message describing why the
/// description couldn't be parsed and what the expected format is.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParseSizeError {
    // The size description exactly as given by the user; echoed back in
    // error messages.
    original: String,
    // The specific reason parsing failed.
    kind: ParseSizeErrorKind,
}

// The specific kind of failure encountered while parsing a size.
#[derive(Clone, Debug, Eq, PartialEq)]
enum ParseSizeErrorKind {
    // The input did not match the `<digits>[KMG]` format.
    InvalidFormat,
    // The digit portion could not be parsed into a `u64`.
    InvalidInt(ParseIntError),
    // The resulting byte count overflowed a `u64`.
    Overflow,
}

impl ParseSizeError {
    // Build a format error for the given input.
    fn format(original: &str) -> ParseSizeError {
        ParseSizeError {
            original: original.to_string(),
            kind: ParseSizeErrorKind::InvalidFormat,
        }
    }

    // Build an integer-parse error for the given input.
    fn int(original: &str, err: ParseIntError) -> ParseSizeError {
        ParseSizeError {
            original: original.to_string(),
            kind: ParseSizeErrorKind::InvalidInt(err),
        }
    }

    // Build an overflow error for the given input.
    fn overflow(original: &str) -> ParseSizeError {
        ParseSizeError {
            original: original.to_string(),
            kind: ParseSizeErrorKind::Overflow,
        }
    }
}

impl error::Error for ParseSizeError {
    fn description(&self) -> &str { "invalid size" }
}

impl fmt::Display for ParseSizeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::ParseSizeErrorKind::*;

        match self.kind {
            InvalidFormat => {
                write!(
                    f,
                    "invalid format for size '{}', which should be a sequence \
                     of digits followed by an optional 'K', 'M' or 'G' \
                     suffix",
                    self.original
                )
            }
            InvalidInt(ref err) => {
                write!(
                    f,
                    "invalid integer found in size '{}': {}",
                    self.original,
                    err
                )
            }
            Overflow => {
                write!(f, "size too big in '{}'", self.original)
            }
        }
    }
}

// Lets callers propagate a size-parse failure with `?` in io contexts.
impl From<ParseSizeError> for io::Error {
    fn from(size_err: ParseSizeError) -> io::Error {
        io::Error::new(io::ErrorKind::Other, size_err)
    }
}
|
||||||
|
|
||||||
|
/// Parse a human readable size like `2M` into a corresponding number of bytes.
|
||||||
|
///
|
||||||
|
/// Supported size suffixes are `K` (for kilobyte), `M` (for megabyte) and `G`
|
||||||
|
/// (for gigabyte). If a size suffix is missing, then the size is interpreted
|
||||||
|
/// as bytes. If the size is too big to fit into a `u64`, then this returns an
|
||||||
|
/// error.
|
||||||
|
///
|
||||||
|
/// Additional suffixes may be added over time.
|
||||||
|
pub fn parse_human_readable_size(size: &str) -> Result<u64, ParseSizeError> {
|
||||||
|
lazy_static! {
|
||||||
|
// Normally I'd just parse something this simple by hand to avoid the
|
||||||
|
// regex dep, but we bring regex in any way for glob matching, so might
|
||||||
|
// as well use it.
|
||||||
|
static ref RE: Regex = Regex::new(r"^([0-9]+)([KMG])?$").unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let caps = match RE.captures(size) {
|
||||||
|
Some(caps) => caps,
|
||||||
|
None => return Err(ParseSizeError::format(size)),
|
||||||
|
};
|
||||||
|
let value: u64 = caps[1].parse().map_err(|err| {
|
||||||
|
ParseSizeError::int(size, err)
|
||||||
|
})?;
|
||||||
|
let suffix = match caps.get(2) {
|
||||||
|
None => return Ok(value),
|
||||||
|
Some(cap) => cap.as_str(),
|
||||||
|
};
|
||||||
|
let bytes = match suffix {
|
||||||
|
"K" => value.checked_mul(1<<10),
|
||||||
|
"M" => value.checked_mul(1<<20),
|
||||||
|
"G" => value.checked_mul(1<<30),
|
||||||
|
// Because if the regex matches this group, it must be [KMG].
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
bytes.ok_or_else(|| ParseSizeError::overflow(size))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // No suffix: the number is taken as a plain byte count.
    #[test]
    fn suffix_none() {
        let x = parse_human_readable_size("123").unwrap();
        assert_eq!(123, x);
    }

    // K/M/G suffixes multiply by 2^10, 2^20 and 2^30 respectively.
    #[test]
    fn suffix_k() {
        let x = parse_human_readable_size("123K").unwrap();
        assert_eq!(123 * (1<<10), x);
    }

    #[test]
    fn suffix_m() {
        let x = parse_human_readable_size("123M").unwrap();
        assert_eq!(123 * (1<<20), x);
    }

    #[test]
    fn suffix_g() {
        let x = parse_human_readable_size("123G").unwrap();
        assert_eq!(123 * (1<<30), x);
    }

    // Inputs that don't match `<digits>[KMG]` are rejected.
    #[test]
    fn invalid_empty() {
        assert!(parse_human_readable_size("").is_err());
    }

    #[test]
    fn invalid_non_digit() {
        assert!(parse_human_readable_size("a").is_err());
    }

    // A value whose byte count exceeds u64::MAX is an overflow error.
    #[test]
    fn invalid_overflow() {
        assert!(parse_human_readable_size("9999999999999999G").is_err());
    }

    // Unknown suffixes (e.g. 'T') are rejected.
    #[test]
    fn invalid_suffix() {
        assert!(parse_human_readable_size("123T").is_err());
    }
}
|
||||||
251
grep-cli/src/lib.rs
Normal file
251
grep-cli/src/lib.rs
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
/*!
|
||||||
|
This crate provides common routines used in command line applications, with a
|
||||||
|
focus on routines useful for search oriented applications. As a utility
|
||||||
|
library, there is no central type or function. However, a key focus of this
|
||||||
|
crate is to improve failure modes and provide user friendly error messages
|
||||||
|
when things go wrong.
|
||||||
|
|
||||||
|
To the best extent possible, everything in this crate works on Windows, macOS
|
||||||
|
and Linux.
|
||||||
|
|
||||||
|
|
||||||
|
# Standard I/O
|
||||||
|
|
||||||
|
The
|
||||||
|
[`is_readable_stdin`](fn.is_readable_stdin.html),
|
||||||
|
[`is_tty_stderr`](fn.is_tty_stderr.html),
|
||||||
|
[`is_tty_stdin`](fn.is_tty_stdin.html)
|
||||||
|
and
|
||||||
|
[`is_tty_stdout`](fn.is_tty_stdout.html)
|
||||||
|
routines query aspects of standard I/O. `is_readable_stdin` determines whether
|
||||||
|
stdin can be usefully read from, while the `tty` methods determine whether a
|
||||||
|
tty is attached to stdin/stdout/stderr.
|
||||||
|
|
||||||
|
`is_readable_stdin` is useful when writing an application that changes behavior
|
||||||
|
based on whether the application was invoked with data on stdin. For example,
|
||||||
|
`rg foo` might recursively search the current working directory for
|
||||||
|
occurrences of `foo`, but `rg foo < file` might only search the contents of
|
||||||
|
`file`.
|
||||||
|
|
||||||
|
The `tty` methods are useful for similar reasons. Namely, commands like `ls`
|
||||||
|
will change their output depending on whether they are printing to a terminal
|
||||||
|
or not. For example, `ls` shows a file on each line when stdout is redirected
|
||||||
|
to a file or a pipe, but condenses the output to show possibly many files on
|
||||||
|
each line when stdout is connected to a tty.
|
||||||
|
|
||||||
|
|
||||||
|
# Coloring and buffering
|
||||||
|
|
||||||
|
The
|
||||||
|
[`stdout`](fn.stdout.html),
|
||||||
|
[`stdout_buffered_block`](fn.stdout_buffered_block.html)
|
||||||
|
and
|
||||||
|
[`stdout_buffered_line`](fn.stdout_buffered_line.html)
|
||||||
|
routines are alternative constructors for
|
||||||
|
[`StandardStream`](struct.StandardStream.html).
|
||||||
|
A `StandardStream` implements `termcolor::WriteColor`, which provides a way
|
||||||
|
to emit colors to terminals. Its key use is the encapsulation of buffering
|
||||||
|
style. Namely, `stdout` will return a line buffered `StandardStream` if and
|
||||||
|
only if stdout is connected to a tty, and will otherwise return a block
|
||||||
|
buffered `StandardStream`. Line buffering is important for use with a tty
|
||||||
|
because it typically decreases the latency at which the end user sees output.
|
||||||
|
Block buffering is used otherwise because it is faster, and redirecting stdout
|
||||||
|
to a file typically doesn't benefit from the decreased latency that line
|
||||||
|
buffering provides.
|
||||||
|
|
||||||
|
The `stdout_buffered_block` and `stdout_buffered_line` can be used to
|
||||||
|
explicitly set the buffering strategy regardless of whether stdout is connected
|
||||||
|
to a tty or not.
|
||||||
|
|
||||||
|
|
||||||
|
# Escaping
|
||||||
|
|
||||||
|
The
|
||||||
|
[`escape`](fn.escape.html),
|
||||||
|
[`escape_os`](fn.escape_os.html),
|
||||||
|
[`unescape`](fn.unescape.html)
|
||||||
|
and
|
||||||
|
[`unescape_os`](fn.unescape_os.html)
|
||||||
|
routines provide a user friendly way of dealing with UTF-8 encoded strings that
|
||||||
|
can express arbitrary bytes. For example, you might want to accept a string
|
||||||
|
containing arbitrary bytes as a command line argument, but most interactive
|
||||||
|
shells make such strings difficult to type. Instead, we can ask users to use
|
||||||
|
escape sequences.
|
||||||
|
|
||||||
|
For example, `a\xFFz` is itself a valid UTF-8 string corresponding to the
|
||||||
|
following bytes:
|
||||||
|
|
||||||
|
```ignore
|
||||||
|
[b'a', b'\\', b'x', b'F', b'F', b'z']
|
||||||
|
```
|
||||||
|
|
||||||
|
However, we can
|
||||||
|
interpret `\xFF` as an escape sequence with the `unescape`/`unescape_os`
|
||||||
|
routines, which will yield
|
||||||
|
|
||||||
|
```ignore
|
||||||
|
[b'a', b'\xFF', b'z']
|
||||||
|
```
|
||||||
|
|
||||||
|
instead. For example:
|
||||||
|
|
||||||
|
```
|
||||||
|
use grep_cli::unescape;
|
||||||
|
|
||||||
|
// Note the use of a raw string!
|
||||||
|
assert_eq!(vec![b'a', b'\xFF', b'z'], unescape(r"a\xFFz"));
|
||||||
|
```
|
||||||
|
|
||||||
|
The `escape`/`escape_os` routines provide the reverse transformation, which
|
||||||
|
makes it easy to show user friendly error messages involving arbitrary bytes.
|
||||||
|
|
||||||
|
|
||||||
|
# Building patterns
|
||||||
|
|
||||||
|
Typically, regular expression patterns must be valid UTF-8. However, command
|
||||||
|
line arguments aren't guaranteed to be valid UTF-8. Unfortunately, the
|
||||||
|
standard library's UTF-8 conversion functions from `OsStr`s do not provide
|
||||||
|
good error messages. However, the
|
||||||
|
[`pattern_from_bytes`](fn.pattern_from_bytes.html)
|
||||||
|
and
|
||||||
|
[`pattern_from_os`](fn.pattern_from_os.html)
|
||||||
|
do, including reporting exactly where the first invalid UTF-8 byte is seen.
|
||||||
|
|
||||||
|
Additionally, it can be useful to read patterns from a file while reporting
|
||||||
|
good error messages that include line numbers. The
|
||||||
|
[`patterns_from_path`](fn.patterns_from_path.html),
|
||||||
|
[`patterns_from_reader`](fn.patterns_from_reader.html)
|
||||||
|
and
|
||||||
|
[`patterns_from_stdin`](fn.patterns_from_stdin.html)
|
||||||
|
routines do just that. If any pattern is found that is invalid UTF-8, then the
|
||||||
|
error includes the file path (if available) along with the line number and the
|
||||||
|
byte offset at which the first invalid UTF-8 byte was observed.
|
||||||
|
|
||||||
|
|
||||||
|
# Read process output
|
||||||
|
|
||||||
|
Sometimes a command line application needs to execute other processes and read
|
||||||
|
its stdout in a streaming fashion. The
|
||||||
|
[`CommandReader`](struct.CommandReader.html)
|
||||||
|
provides this functionality with an explicit goal of improving failure modes.
|
||||||
|
In particular, if the process exits with an error code, then stderr is read
|
||||||
|
and converted into a normal Rust error to show to end users. This makes the
|
||||||
|
underlying failure modes explicit and gives more information to end users for
|
||||||
|
debugging the problem.
|
||||||
|
|
||||||
|
As a special case,
|
||||||
|
[`DecompressionReader`](struct.DecompressionReader.html)
|
||||||
|
provides a way to decompress arbitrary files by matching their file extensions
|
||||||
|
up with corresponding decompression programs (such as `gzip` and `xz`). This
|
||||||
|
is useful as a means of performing simplistic decompression in a portable
|
||||||
|
manner without binding to specific compression libraries. This does come with
|
||||||
|
some overhead though, so if you need to decompress lots of small files, this
|
||||||
|
may not be an appropriate convenience to use.
|
||||||
|
|
||||||
|
Each reader has a corresponding builder for additional configuration, such as
|
||||||
|
whether to read stderr asynchronously in order to avoid deadlock (which is
|
||||||
|
enabled by default).
|
||||||
|
|
||||||
|
|
||||||
|
# Miscellaneous parsing
|
||||||
|
|
||||||
|
The
|
||||||
|
[`parse_human_readable_size`](fn.parse_human_readable_size.html)
|
||||||
|
routine parses strings like `2M` and converts them to the corresponding number
|
||||||
|
of bytes (`2 * 1<<20` in this case). If an invalid size is found, then a good
|
||||||
|
error message is crafted that typically tells the user how to fix the problem.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
|
extern crate atty;
|
||||||
|
extern crate globset;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate lazy_static;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate log;
|
||||||
|
extern crate regex;
|
||||||
|
extern crate same_file;
|
||||||
|
extern crate termcolor;
|
||||||
|
#[cfg(windows)]
|
||||||
|
extern crate winapi_util;
|
||||||
|
|
||||||
|
mod decompress;
|
||||||
|
mod escape;
|
||||||
|
mod human;
|
||||||
|
mod pattern;
|
||||||
|
mod process;
|
||||||
|
mod wtr;
|
||||||
|
|
||||||
|
pub use decompress::{
|
||||||
|
DecompressionMatcher, DecompressionMatcherBuilder,
|
||||||
|
DecompressionReader, DecompressionReaderBuilder,
|
||||||
|
};
|
||||||
|
pub use escape::{escape, escape_os, unescape, unescape_os};
|
||||||
|
pub use human::{ParseSizeError, parse_human_readable_size};
|
||||||
|
pub use pattern::{
|
||||||
|
InvalidPatternError,
|
||||||
|
pattern_from_os, pattern_from_bytes,
|
||||||
|
patterns_from_path, patterns_from_reader, patterns_from_stdin,
|
||||||
|
};
|
||||||
|
pub use process::{CommandError, CommandReader, CommandReaderBuilder};
|
||||||
|
pub use wtr::{
|
||||||
|
StandardStream,
|
||||||
|
stdout, stdout_buffered_line, stdout_buffered_block,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Returns true if and only if stdin is believed to be readable.
///
/// When stdin is readable, command line programs may choose to behave
/// differently than when stdin is not readable. For example, `command foo`
/// might search the current directory for occurrences of `foo` where as
/// `command foo < some-file` or `cat some-file | command foo` might instead
/// only search stdin for occurrences of `foo`.
pub fn is_readable_stdin() -> bool {
    // On Unix, stdin counts as readable when it refers to a regular file or
    // a FIFO (the read end of a pipe). Any failure to stat the handle is
    // treated as "not readable".
    #[cfg(unix)]
    fn imp() -> bool {
        use std::os::unix::fs::FileTypeExt;
        use same_file::Handle;

        let ft = match Handle::stdin().and_then(|h| h.as_file().metadata()) {
            Err(_) => return false,
            Ok(md) => md.file_type(),
        };
        ft.is_file() || ft.is_fifo()
    }

    // On Windows, stdin counts as readable when its handle refers to a disk
    // file or a pipe; if the handle type can't be queried, report false.
    #[cfg(windows)]
    fn imp() -> bool {
        use winapi_util as winutil;

        winutil::file::typ(winutil::HandleRef::stdin())
            .map(|t| t.is_disk() || t.is_pipe())
            .unwrap_or(false)
    }

    // A stdin attached to a tty is interactive and is never treated as
    // readable data, regardless of the platform check above.
    !is_tty_stdin() && imp()
}
|
||||||
|
|
||||||
|
/// Returns true if and only if stdin is believed to be connected to a tty
/// or a console.
pub fn is_tty_stdin() -> bool {
    atty::is(atty::Stream::Stdin)
}
|
||||||
|
|
||||||
|
/// Returns true if and only if stdout is believed to be connected to a tty
/// or a console.
///
/// This is useful for when you want your command line program to produce
/// different output depending on whether it's printing directly to a user's
/// terminal or whether it's being redirected somewhere else. For example,
/// implementations of `ls` will often show one item per line when stdout is
/// redirected, but will condense output when printing to a tty.
pub fn is_tty_stdout() -> bool {
    atty::is(atty::Stream::Stdout)
}
|
||||||
|
|
||||||
|
/// Returns true if and only if stderr is believed to be connected to a tty
/// or a console.
pub fn is_tty_stderr() -> bool {
    atty::is(atty::Stream::Stderr)
}
|
||||||
205
grep-cli/src/pattern.rs
Normal file
205
grep-cli/src/pattern.rs
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::ffi::OsStr;
|
||||||
|
use std::fmt;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufRead};
|
||||||
|
use std::path::Path;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use escape::{escape, escape_os};
|
||||||
|
|
||||||
|
/// An error that occurs when a pattern could not be converted to valid UTF-8.
///
/// The purpose of this error is to give a more targeted failure mode for
/// patterns written by end users that are not valid UTF-8.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct InvalidPatternError {
    // An escaped (printable) rendering of the offending pattern, suitable
    // for inclusion in the error message.
    original: String,
    // The byte offset up to which the pattern was valid UTF-8.
    valid_up_to: usize,
}

impl InvalidPatternError {
    /// Returns the index in the given string up to which valid UTF-8 was
    /// verified.
    pub fn valid_up_to(&self) -> usize {
        self.valid_up_to
    }
}

impl error::Error for InvalidPatternError {
    fn description(&self) -> &str { "invalid pattern" }
}

impl fmt::Display for InvalidPatternError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "found invalid UTF-8 in pattern at byte offset {} \
             (use hex escape sequences to match arbitrary bytes \
             in a pattern, e.g., \\xFF): '{}'",
            self.valid_up_to,
            self.original,
        )
    }
}

// Lets callers propagate an invalid pattern with `?` in io contexts.
impl From<InvalidPatternError> for io::Error {
    fn from(paterr: InvalidPatternError) -> io::Error {
        io::Error::new(io::ErrorKind::Other, paterr)
    }
}
|
||||||
|
|
||||||
|
/// Convert an OS string into a regular expression pattern.
///
/// This conversion fails if the given pattern is not valid UTF-8, in which
/// case, a targeted error with more information about where the invalid UTF-8
/// occurs is given. The error also suggests the use of hex escape sequences,
/// which are supported by many regex engines.
pub fn pattern_from_os(pattern: &OsStr) -> Result<&str, InvalidPatternError> {
    pattern.to_str().ok_or_else(|| {
        // `to_str` failed, so there is at least one invalid UTF-8 sequence,
        // which lossy conversion renders as U+FFFD. The bytes preceding the
        // first replacement character are valid UTF-8 and unchanged by the
        // lossy conversion, so its position is the valid-up-to byte offset.
        let valid_up_to = pattern
            .to_string_lossy()
            .find('\u{FFFD}')
            .expect("a Unicode replacement codepoint for invalid UTF-8");
        InvalidPatternError {
            original: escape_os(pattern),
            valid_up_to: valid_up_to,
        }
    })
}
|
||||||
|
|
||||||
|
/// Convert arbitrary bytes into a regular expression pattern.
|
||||||
|
///
|
||||||
|
/// This conversion fails if the given pattern is not valid UTF-8, in which
|
||||||
|
/// case, a targeted error with more information about where the invalid UTF-8
|
||||||
|
/// occurs is given. The error also suggests the use of hex escape sequences,
|
||||||
|
/// which are supported by many regex engines.
|
||||||
|
pub fn pattern_from_bytes(
|
||||||
|
pattern: &[u8],
|
||||||
|
) -> Result<&str, InvalidPatternError> {
|
||||||
|
str::from_utf8(pattern).map_err(|err| {
|
||||||
|
InvalidPatternError {
|
||||||
|
original: escape(pattern),
|
||||||
|
valid_up_to: err.valid_up_to(),
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read patterns from a file path, one per line.
///
/// If there was a problem reading or if any of the patterns contain invalid
/// UTF-8, then an error is returned. If there was a problem with a specific
/// pattern, then the error message will include the line number and the file
/// path.
pub fn patterns_from_path<P: AsRef<Path>>(path: P) -> io::Result<Vec<String>> {
    let path = path.as_ref();
    // Prefix open failures with the path so users can tell which file broke.
    let file = File::open(path).map_err(|err| {
        io::Error::new(
            io::ErrorKind::Other,
            format!("{}: {}", path.display(), err),
        )
    })?;
    // No space after ':' here: patterns_from_reader errors already begin
    // with "<line>: ", producing messages of the form "path:line: msg".
    patterns_from_reader(file).map_err(|err| {
        io::Error::new(
            io::ErrorKind::Other,
            format!("{}:{}", path.display(), err),
        )
    })
}
|
||||||
|
|
||||||
|
/// Read patterns from stdin, one per line.
///
/// If there was a problem reading or if any of the patterns contain invalid
/// UTF-8, then an error is returned. If there was a problem with a specific
/// pattern, then the error message will include the line number and the fact
/// that it came from stdin.
pub fn patterns_from_stdin() -> io::Result<Vec<String>> {
    let stdin = io::stdin();
    let locked = stdin.lock();
    // Use "<stdin>" as the pseudo-path, mirroring patterns_from_path's
    // "path:line: msg" error format.
    patterns_from_reader(locked).map_err(|err| {
        io::Error::new(
            io::ErrorKind::Other,
            format!("<stdin>:{}", err),
        )
    })
}
|
||||||
|
|
||||||
|
/// Read patterns from any reader, one per line.
|
||||||
|
///
|
||||||
|
/// If there was a problem reading or if any of the patterns contain invalid
|
||||||
|
/// UTF-8, then an error is returned. If there was a problem with a specific
|
||||||
|
/// pattern, then the error message will include the line number.
|
||||||
|
///
|
||||||
|
/// Note that this routine uses its own internal buffer, so the caller should
|
||||||
|
/// not provide their own buffered reader if possible.
|
||||||
|
///
|
||||||
|
/// # Example
|
||||||
|
///
|
||||||
|
/// This shows how to parse patterns, one per line.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// use grep_cli::patterns_from_reader;
|
||||||
|
///
|
||||||
|
/// # fn example() -> Result<(), Box<::std::error::Error>> {
|
||||||
|
/// let patterns = "\
|
||||||
|
/// foo
|
||||||
|
/// bar\\s+foo
|
||||||
|
/// [a-z]{3}
|
||||||
|
/// ";
|
||||||
|
///
|
||||||
|
/// assert_eq!(patterns_from_reader(patterns.as_bytes())?, vec![
|
||||||
|
/// r"foo",
|
||||||
|
/// r"bar\s+foo",
|
||||||
|
/// r"[a-z]{3}",
|
||||||
|
/// ]);
|
||||||
|
/// # Ok(()) }
|
||||||
|
/// ```
|
||||||
|
pub fn patterns_from_reader<R: io::Read>(rdr: R) -> io::Result<Vec<String>> {
|
||||||
|
let mut patterns = vec![];
|
||||||
|
let mut bufrdr = io::BufReader::new(rdr);
|
||||||
|
let mut line = vec![];
|
||||||
|
let mut line_number = 0;
|
||||||
|
while {
|
||||||
|
line.clear();
|
||||||
|
line_number += 1;
|
||||||
|
bufrdr.read_until(b'\n', &mut line)? > 0
|
||||||
|
} {
|
||||||
|
line.pop().unwrap(); // remove trailing '\n'
|
||||||
|
if line.last() == Some(&b'\r') {
|
||||||
|
line.pop().unwrap();
|
||||||
|
}
|
||||||
|
match pattern_from_bytes(&line) {
|
||||||
|
Ok(pattern) => patterns.push(pattern.to_string()),
|
||||||
|
Err(err) => {
|
||||||
|
return Err(io::Error::new(
|
||||||
|
io::ErrorKind::Other,
|
||||||
|
format!("{}: {}", line_number, err),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(patterns)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // The first invalid byte (\xFF) sits at offset 3, right after "abc".
    #[test]
    fn bytes() {
        let pat = b"abc\xFFxyz";
        let err = pattern_from_bytes(pat).unwrap_err();
        assert_eq!(3, err.valid_up_to());
    }

    // Same check via the OsStr entry point; only Unix exposes raw bytes in
    // an OsStr, hence the cfg.
    #[test]
    #[cfg(unix)]
    fn os() {
        use std::os::unix::ffi::OsStrExt;
        use std::ffi::OsStr;

        let pat = OsStr::from_bytes(b"abc\xFFxyz");
        let err = pattern_from_os(pat).unwrap_err();
        assert_eq!(3, err.valid_up_to());
    }
}
|
||||||
267
grep-cli/src/process.rs
Normal file
267
grep-cli/src/process.rs
Normal file
@@ -0,0 +1,267 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::fmt;
|
||||||
|
use std::io::{self, Read};
|
||||||
|
use std::iter;
|
||||||
|
use std::process;
|
||||||
|
use std::thread::{self, JoinHandle};
|
||||||
|
|
||||||
|
/// An error that can occur while running a command and reading its output.
///
/// This error can be seamlessly converted to an `io::Error` via a `From`
/// implementation.
#[derive(Debug)]
pub struct CommandError {
    // The underlying cause of the failure.
    kind: CommandErrorKind,
}

// The underlying cause of a command error.
#[derive(Debug)]
enum CommandErrorKind {
    // An underlying I/O error.
    Io(io::Error),
    // The raw contents of the child process's stderr (possibly empty).
    Stderr(Vec<u8>),
}

impl CommandError {
    /// Create an error from an I/O error.
    pub(crate) fn io(ioerr: io::Error) -> CommandError {
        CommandError { kind: CommandErrorKind::Io(ioerr) }
    }

    /// Create an error from the contents of stderr (which may be empty).
    pub(crate) fn stderr(bytes: Vec<u8>) -> CommandError {
        CommandError { kind: CommandErrorKind::Stderr(bytes) }
    }
}

impl error::Error for CommandError {
    fn description(&self) -> &str { "command error" }
}
|
||||||
|
|
||||||
|
impl fmt::Display for CommandError {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
match self.kind {
|
||||||
|
CommandErrorKind::Io(ref e) => e.fmt(f),
|
||||||
|
CommandErrorKind::Stderr(ref bytes) => {
|
||||||
|
let msg = String::from_utf8_lossy(bytes);
|
||||||
|
if msg.trim().is_empty() {
|
||||||
|
write!(f, "<stderr is empty>")
|
||||||
|
} else {
|
||||||
|
let div = iter::repeat('-').take(79).collect::<String>();
|
||||||
|
write!(f, "\n{div}\n{msg}\n{div}", div=div, msg=msg.trim())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lets `?` lift raw I/O errors into `CommandError` transparently.
impl From<io::Error> for CommandError {
    fn from(ioerr: io::Error) -> CommandError {
        CommandError { kind: CommandErrorKind::Io(ioerr) }
    }
}

// Converting back: an I/O error is unwrapped to its original form, while a
// stderr-based error is wrapped so its Display output is preserved.
impl From<CommandError> for io::Error {
    fn from(cmderr: CommandError) -> io::Error {
        match cmderr.kind {
            CommandErrorKind::Io(ioerr) => ioerr,
            CommandErrorKind::Stderr(_) => {
                io::Error::new(io::ErrorKind::Other, cmderr)
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// Configures and builds a streaming reader for process output.
#[derive(Clone, Debug, Default)]
pub struct CommandReaderBuilder {
    // When true, the child's stderr is read asynchronously while stdout is
    // consumed (see `async_stderr` for the deadlock rationale).
    async_stderr: bool,
}
|
||||||
|
|
||||||
|
impl CommandReaderBuilder {
    /// Create a new builder with the default configuration.
    pub fn new() -> CommandReaderBuilder {
        CommandReaderBuilder::default()
    }

    /// Build a new streaming reader for the given command's output.
    ///
    /// The caller should set everything that's required on the given command
    /// before building a reader, such as its arguments, environment and
    /// current working directory. Settings such as the stdout and stderr (but
    /// not stdin) pipes will be overridden so that they can be controlled by
    /// the reader.
    ///
    /// If there was a problem spawning the given command, then its error is
    /// returned.
    pub fn build(
        &self,
        command: &mut process::Command,
    ) -> Result<CommandReader, CommandError> {
        // Take over stdout and stderr; stdin is deliberately left alone.
        let mut child = command
            .stdout(process::Stdio::piped())
            .stderr(process::Stdio::piped())
            .spawn()?;
        let stdout = child.stdout.take().unwrap();
        // NOTE(review): `async` became a reserved word in the 2018 edition,
        // so this method name only compiles on the 2015 edition.
        let stderr =
            if self.async_stderr {
                StderrReader::async(child.stderr.take().unwrap())
            } else {
                StderrReader::sync(child.stderr.take().unwrap())
            };
        Ok(CommandReader {
            child: child,
            stdout: stdout,
            stderr: stderr,
            done: false,
        })
    }

    /// When enabled, the reader will asynchronously read the contents of the
    /// command's stderr output. When disabled, stderr is only read after the
    /// stdout stream has been exhausted (or if the process quits with an error
    /// code).
    ///
    /// Note that when enabled, this may require launching an additional
    /// thread in order to read stderr. This is done so that the process being
    /// executed is never blocked from writing to stdout or stderr. If this is
    /// disabled, then it is possible for the process to fill up the stderr
    /// buffer and deadlock.
    ///
    /// This is enabled by default.
    // NOTE(review): the doc above says this is enabled by default, but
    // `#[derive(Default)]` on the builder initializes `async_stderr` to
    // `false` — confirm which default is intended.
    pub fn async_stderr(&mut self, yes: bool) -> &mut CommandReaderBuilder {
        self.async_stderr = yes;
        self
    }
}
|
||||||
|
|
||||||
|
/// A streaming reader for a command's output.
///
/// The purpose of this reader is to provide an easy way to execute processes
/// whose stdout is read in a streaming way while also making the processes'
/// stderr available when the process fails with an exit code. This makes it
/// possible to execute processes while surfacing the underlying failure mode
/// in the case of an error.
///
/// Moreover, by default, this reader will asynchronously read the processes'
/// stderr. This prevents subtle deadlocking bugs for noisy processes that
/// write a lot to stderr. Currently, the entire contents of stderr is read
/// on to the heap.
///
/// # Example
///
/// This example shows how to invoke `gzip` to decompress the contents of a
/// file. If the `gzip` command reports a failing exit status, then its stderr
/// is returned as an error.
///
/// ```no_run
/// use std::io::Read;
/// use std::process::Command;
/// use grep_cli::CommandReader;
///
/// # fn example() -> Result<(), Box<::std::error::Error>> {
/// let mut cmd = Command::new("gzip");
/// cmd.arg("-d").arg("-c").arg("/usr/share/man/man1/ls.1.gz");
///
/// let mut rdr = CommandReader::new(&mut cmd)?;
/// let mut contents = vec![];
/// rdr.read_to_end(&mut contents)?;
/// # Ok(()) }
/// ```
#[derive(Debug)]
pub struct CommandReader {
    // The spawned child process.
    child: process::Child,
    // The child's stdout pipe, read in a streaming fashion.
    stdout: process::ChildStdout,
    // Reader for the child's stderr; sync or async per the builder setting.
    stderr: StderrReader,
    // presumably flipped once the stream is exhausted / the child reaped —
    // the impl is not visible in this chunk; confirm against the full file.
    done: bool,
}
|
||||||
|
|
||||||
|
impl CommandReader {
    /// Create a new streaming reader for the given command using the default
    /// configuration.
    ///
    /// The caller should set everything that's required on the given command
    /// before building a reader, such as its arguments, environment and
    /// current working directory. Settings such as the stdout and stderr (but
    /// not stdin) pipes will be overridden so that they can be controlled by
    /// the reader.
    ///
    /// If there was a problem spawning the given command, then its error is
    /// returned.
    ///
    /// If the caller requires additional configuration for the reader
    /// returned, then use
    /// [`CommandReaderBuilder`](struct.CommandReaderBuilder.html).
    pub fn new(
        cmd: &mut process::Command,
    ) -> Result<CommandReader, CommandError> {
        // Convenience shortcut for the builder with all defaults.
        CommandReaderBuilder::new().build(cmd)
    }
}
|
||||||
|
|
||||||
|
impl io::Read for CommandReader {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // After stdout has been exhausted (and the child reaped), keep
        // reporting EOF without touching the child again.
        if self.done {
            return Ok(0);
        }
        let nread = self.stdout.read(buf)?;
        if nread == 0 {
            self.done = true;
            // Reap the child now that we're done reading. If the command
            // failed, report stderr as an error.
            if !self.child.wait()?.success() {
                return Err(io::Error::from(self.stderr.read_to_end()));
            }
        }
        Ok(nread)
    }
}
|
||||||
|
|
||||||
|
/// A reader that encapsulates the asynchronous or synchronous reading of
/// stderr.
#[derive(Debug)]
enum StderrReader {
    /// A handle to a background thread that drains stderr as the child runs.
    /// The `Option` is `Some` until `read_to_end` takes and joins the handle.
    Async(Option<JoinHandle<CommandError>>),
    /// The raw stderr pipe, read on the caller's thread only when requested.
    Sync(process::ChildStderr),
}
|
||||||
|
|
||||||
|
impl StderrReader {
    /// Create a reader for stderr that reads contents asynchronously.
    fn async(mut stderr: process::ChildStderr) -> StderrReader {
        // Drain stderr on a dedicated thread so that the child process is
        // never blocked writing to a full stderr pipe while we read stdout.
        let handle = thread::spawn(move || {
            stderr_to_command_error(&mut stderr)
        });
        StderrReader::Async(Some(handle))
    }

    /// Create a reader for stderr that reads contents synchronously.
    fn sync(stderr: process::ChildStderr) -> StderrReader {
        StderrReader::Sync(stderr)
    }

    /// Consumes all of stderr on to the heap and returns it as an error.
    ///
    /// If there was a problem reading stderr itself, then this returns an I/O
    /// command error.
    fn read_to_end(&mut self) -> CommandError {
        match *self {
            StderrReader::Async(ref mut handle) => {
                // `take` ensures the thread is joined at most once; a second
                // call is a programming error and panics with a clear message.
                let handle = handle
                    .take()
                    .expect("read_to_end cannot be called more than once");
                handle
                    .join()
                    .expect("stderr reading thread does not panic")
            }
            StderrReader::Sync(ref mut stderr) => {
                stderr_to_command_error(stderr)
            }
        }
    }
}
|
||||||
|
|
||||||
|
fn stderr_to_command_error(stderr: &mut process::ChildStderr) -> CommandError {
|
||||||
|
let mut bytes = vec![];
|
||||||
|
match stderr.read_to_end(&mut bytes) {
|
||||||
|
Ok(_) => CommandError::stderr(bytes),
|
||||||
|
Err(err) => CommandError::io(err),
|
||||||
|
}
|
||||||
|
}
|
||||||
133
grep-cli/src/wtr.rs
Normal file
133
grep-cli/src/wtr.rs
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
use std::io;
|
||||||
|
|
||||||
|
use termcolor;
|
||||||
|
|
||||||
|
use is_tty_stdout;
|
||||||
|
|
||||||
|
/// A writer that supports coloring with either line or block buffering.
///
/// This is a thin newtype over `StandardStreamKind` that hides which
/// buffering strategy was chosen; construct one via `stdout`,
/// `stdout_buffered_line` or `stdout_buffered_block`.
pub struct StandardStream(StandardStreamKind);
|
||||||
|
|
||||||
|
/// Returns a possibly buffered writer to stdout for the given color choice.
|
||||||
|
///
|
||||||
|
/// The writer returned is either line buffered or block buffered. The decision
|
||||||
|
/// between these two is made automatically based on whether a tty is attached
|
||||||
|
/// to stdout or not. If a tty is attached, then line buffering is used.
|
||||||
|
/// Otherwise, block buffering is used. In general, block buffering is more
|
||||||
|
/// efficient, but may increase the time it takes for the end user to see the
|
||||||
|
/// first bits of output.
|
||||||
|
///
|
||||||
|
/// If you need more fine grained control over the buffering mode, then use one
|
||||||
|
/// of `stdout_buffered_line` or `stdout_buffered_block`.
|
||||||
|
///
|
||||||
|
/// The color choice given is passed along to the underlying writer. To
|
||||||
|
/// completely disable colors in all cases, use `ColorChoice::Never`.
|
||||||
|
pub fn stdout(color_choice: termcolor::ColorChoice) -> StandardStream {
|
||||||
|
if is_tty_stdout() {
|
||||||
|
stdout_buffered_line(color_choice)
|
||||||
|
} else {
|
||||||
|
stdout_buffered_block(color_choice)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a line buffered writer to stdout for the given color choice.
|
||||||
|
///
|
||||||
|
/// This writer is useful when printing results directly to a tty such that
|
||||||
|
/// users see output as soon as it's written. The downside of this approach
|
||||||
|
/// is that it can be slower, especially when there is a lot of output.
|
||||||
|
///
|
||||||
|
/// You might consider using
|
||||||
|
/// [`stdout`](fn.stdout.html)
|
||||||
|
/// instead, which chooses the buffering strategy automatically based on
|
||||||
|
/// whether stdout is connected to a tty.
|
||||||
|
pub fn stdout_buffered_line(
|
||||||
|
color_choice: termcolor::ColorChoice,
|
||||||
|
) -> StandardStream {
|
||||||
|
let out = termcolor::StandardStream::stdout(color_choice);
|
||||||
|
StandardStream(StandardStreamKind::LineBuffered(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns a block buffered writer to stdout for the given color choice.
|
||||||
|
///
|
||||||
|
/// This writer is useful when printing results to a file since it amortizes
|
||||||
|
/// the cost of writing data. The downside of this approach is that it can
|
||||||
|
/// increase the latency of display output when writing to a tty.
|
||||||
|
///
|
||||||
|
/// You might consider using
|
||||||
|
/// [`stdout`](fn.stdout.html)
|
||||||
|
/// instead, which chooses the buffering strategy automatically based on
|
||||||
|
/// whether stdout is connected to a tty.
|
||||||
|
pub fn stdout_buffered_block(
|
||||||
|
color_choice: termcolor::ColorChoice,
|
||||||
|
) -> StandardStream {
|
||||||
|
let out = termcolor::BufferedStandardStream::stdout(color_choice);
|
||||||
|
StandardStream(StandardStreamKind::BlockBuffered(out))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The concrete buffering strategy backing a `StandardStream`.
enum StandardStreamKind {
    /// A line buffered stream (flushes on every newline).
    LineBuffered(termcolor::StandardStream),
    /// A block buffered stream (flushes when its internal buffer fills).
    BlockBuffered(termcolor::BufferedStandardStream),
}
|
||||||
|
|
||||||
|
impl io::Write for StandardStream {
|
||||||
|
#[inline]
|
||||||
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref mut w) => w.write(buf),
|
||||||
|
BlockBuffered(ref mut w) => w.write(buf),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn flush(&mut self) -> io::Result<()> {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref mut w) => w.flush(),
|
||||||
|
BlockBuffered(ref mut w) => w.flush(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl termcolor::WriteColor for StandardStream {
|
||||||
|
#[inline]
|
||||||
|
fn supports_color(&self) -> bool {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref w) => w.supports_color(),
|
||||||
|
BlockBuffered(ref w) => w.supports_color(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn set_color(&mut self, spec: &termcolor::ColorSpec) -> io::Result<()> {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref mut w) => w.set_color(spec),
|
||||||
|
BlockBuffered(ref mut w) => w.set_color(spec),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn reset(&mut self) -> io::Result<()> {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref mut w) => w.reset(),
|
||||||
|
BlockBuffered(ref mut w) => w.reset(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn is_synchronous(&self) -> bool {
|
||||||
|
use self::StandardStreamKind::*;
|
||||||
|
|
||||||
|
match self.0 {
|
||||||
|
LineBuffered(ref w) => w.is_synchronous(),
|
||||||
|
BlockBuffered(ref w) => w.is_synchronous(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
24
grep-matcher/Cargo.toml
Normal file
24
grep-matcher/Cargo.toml
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-matcher"
|
||||||
|
version = "0.1.1" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
A trait for regular expressions, with a focus on line oriented search.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-matcher"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["regex", "pattern", "trait"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
autotests = false
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
memchr = "2.1"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
regex = "1.1"
|
||||||
|
|
||||||
|
[[test]]
|
||||||
|
name = "integration"
|
||||||
|
path = "tests/tests.rs"
|
||||||
21
grep-matcher/LICENSE-MIT
Normal file
21
grep-matcher/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
36
grep-matcher/README.md
Normal file
36
grep-matcher/README.md
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
grep-matcher
|
||||||
|
------------
|
||||||
|
This crate provides a low level interface for describing regular expression
|
||||||
|
matchers. The `grep` crate uses this interface in order to make the regex
|
||||||
|
engine it uses pluggable.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-matcher)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-matcher](https://docs.rs/grep-matcher)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-matcher = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_matcher;
|
||||||
|
```
|
||||||
24
grep-matcher/UNLICENSE
Normal file
24
grep-matcher/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
328
grep-matcher/src/interpolate.rs
Normal file
328
grep-matcher/src/interpolate.rs
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
use std::str;
|
||||||
|
|
||||||
|
use memchr::memchr;
|
||||||
|
|
||||||
|
/// Interpolate capture references in `replacement` and write the interpolation
/// result to `dst`. References in `replacement` take the form of $N or $name,
/// where `N` is a capture group index and `name` is a capture group name. The
/// function provided, `name_to_index`, maps capture group names to indices.
///
/// The `append` function given is responsible for writing the replacement
/// to the `dst` buffer. That is, it is called with the capture group index
/// of a capture group reference and is expected to resolve the index to its
/// corresponding matched text. If no such match exists, then `append` should
/// not write anything to its given buffer.
pub fn interpolate<A, N>(
    mut replacement: &[u8],
    mut append: A,
    mut name_to_index: N,
    dst: &mut Vec<u8>,
) where
    A: FnMut(usize, &mut Vec<u8>),
    N: FnMut(&str) -> Option<usize>
{
    while !replacement.is_empty() {
        match memchr(b'$', replacement) {
            // No '$' left at all; the tail is copied verbatim below.
            None => break,
            Some(i) => {
                // Copy everything before the '$' verbatim, then re-slice so
                // the '$' sits at index 0 for the checks below.
                dst.extend(&replacement[..i]);
                replacement = &replacement[i..];
            }
        }
        // "$$" is the escape for a literal '$'.
        if replacement.get(1).map_or(false, |&b| b == b'$') {
            dst.push(b'$');
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());
        let cap_ref = match find_cap_ref(replacement) {
            Some(cap_ref) => cap_ref,
            None => {
                // '$' not followed by a valid reference: emit it literally.
                dst.push(b'$');
                replacement = &replacement[1..];
                continue;
            }
        };
        // Skip past the reference; `cap_ref.end` is the offset just after it.
        replacement = &replacement[cap_ref.end..];
        match cap_ref.cap {
            Ref::Number(i) => append(i, dst),
            Ref::Named(name) => {
                // Names that don't resolve expand to nothing.
                if let Some(i) = name_to_index(name) {
                    append(i, dst);
                }
            }
        }
    }
    // Copy whatever remains after the last reference (or the whole input if
    // it contained no '$').
    dst.extend(replacement);
}
|
||||||
|
|
||||||
|
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text immediately following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    /// The name or number that was referenced.
    cap: Ref<'a>,
    /// The offset just past the reference (callers slice the input at this
    /// offset to continue scanning).
    end: usize,
}
|
||||||
|
|
||||||
|
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by group name, e.g., `$foo` or `${foo}`.
    Named(&'a str),
    /// A reference by group index, e.g., `$2` or `${2}`.
    Number(usize),
}
|
||||||
|
|
||||||
|
// Convenience conversion used by tests to build a named reference.
impl<'a> From<&'a str> for Ref<'a> {
    fn from(x: &'a str) -> Ref<'a> {
        Ref::Named(x)
    }
}
|
||||||
|
|
||||||
|
// Convenience conversion used by tests to build a numbered reference.
impl From<usize> for Ref<'static> {
    fn from(x: usize) -> Ref<'static> {
        Ref::Number(x)
    }
}
|
||||||
|
|
||||||
|
/// Parses a possible reference to a capture group name in the given text,
|
||||||
|
/// starting at the beginning of `replacement`.
|
||||||
|
///
|
||||||
|
/// If no such valid reference could be found, None is returned.
|
||||||
|
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef> {
|
||||||
|
let mut i = 0;
|
||||||
|
if replacement.len() <= 1 || replacement[0] != b'$' {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let mut brace = false;
|
||||||
|
i += 1;
|
||||||
|
if replacement[i] == b'{' {
|
||||||
|
brace = true;
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
let mut cap_end = i;
|
||||||
|
while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
|
||||||
|
cap_end += 1;
|
||||||
|
}
|
||||||
|
if cap_end == i {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// We just verified that the range 0..cap_end is valid ASCII, so it must
|
||||||
|
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
|
||||||
|
// check with an unchecked conversion or by parsing the number straight
|
||||||
|
// from &[u8].
|
||||||
|
let cap = str::from_utf8(&replacement[i..cap_end])
|
||||||
|
.expect("valid UTF-8 capture name");
|
||||||
|
if brace {
|
||||||
|
if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
cap_end += 1;
|
||||||
|
}
|
||||||
|
Some(CaptureRef {
|
||||||
|
cap: match cap.parse::<u32>() {
|
||||||
|
Ok(i) => Ref::Number(i as usize),
|
||||||
|
Err(_) => Ref::Named(cap),
|
||||||
|
},
|
||||||
|
end: cap_end,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the given byte is allowed in a capture name:
/// an ASCII digit, an ASCII letter or an underscore.
fn is_valid_cap_letter(b: &u8) -> bool {
    // `..=` is the non-deprecated spelling of the old `...` inclusive range
    // pattern (the `...` form warns on modern compilers and is rejected in
    // edition 2021).
    match *b {
        b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
        _ => false,
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::{CaptureRef, find_cap_ref, interpolate};

    // Asserts that `find_cap_ref` on `$text` yields the given `CaptureRef`
    // (three-argument form) or `None` (two-argument form).
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Shorthand for constructing a `CaptureRef` from a name-or-number and an
    // end offset, relying on the `From` impls for `Ref`.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");

    // A convenience routine for using interpolate's unwieldy but flexible API.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so the name lookup below can binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        interpolate(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }

    // Asserts that interpolating `$hay` with the given name map and capture
    // texts produces `$expected`.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!($expected, interpolate_string($map, $caps, $hay));
            }
        }
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test ${} test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test ${ } test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test ${a b} test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );
}
|
||||||
1136
grep-matcher/src/lib.rs
Normal file
1136
grep-matcher/src/lib.rs
Normal file
File diff suppressed because it is too large
Load Diff
208
grep-matcher/tests/test_matcher.rs
Normal file
208
grep-matcher/tests/test_matcher.rs
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
use grep_matcher::{Captures, Match, Matcher};
|
||||||
|
use regex::bytes::Regex;
|
||||||
|
|
||||||
|
use util::{RegexMatcher, RegexMatcherNoCaps};
|
||||||
|
|
||||||
|
// Builds the test `Matcher` implementation around a compiled regex.
fn matcher(pattern: &str) -> RegexMatcher {
    RegexMatcher::new(Regex::new(pattern).unwrap())
}

// Builds the test matcher variant without capture support (exercised by the
// `no_captures` test below).
fn matcher_no_caps(pattern: &str) -> RegexMatcherNoCaps {
    RegexMatcherNoCaps(Regex::new(pattern).unwrap())
}

// Shorthand for constructing a `Match` from start/end offsets.
fn m(start: usize, end: usize) -> Match {
    Match::new(start, end)
}
|
||||||
|
|
||||||
|
// `find` reports the span of the overall match (group 0).
#[test]
fn find() {
    let matcher = matcher(r"(\w+)\s+(\w+)");
    assert_eq!(matcher.find(b" homer simpson ").unwrap(), Some(m(1, 14)));
}

// `find_iter` visits non-overlapping matches in order and stops when the
// callback returns false.
#[test]
fn find_iter() {
    let matcher = matcher(r"(\w+)\s+(\w+)");
    let mut matches = vec![];
    matcher.find_iter(b"aa bb cc dd", |m| {
        matches.push(m);
        true
    }).unwrap();
    assert_eq!(matches, vec![m(0, 5), m(6, 11)]);

    // Test that find_iter respects short circuiting.
    matches.clear();
    matcher.find_iter(b"aa bb cc dd", |m| {
        matches.push(m);
        false
    }).unwrap();
    assert_eq!(matches, vec![m(0, 5)]);
}

// `try_find_iter` propagates the callback's error and stops iterating.
#[test]
fn try_find_iter() {
    #[derive(Clone, Debug, Eq, PartialEq)]
    struct MyError;

    let matcher = matcher(r"(\w+)\s+(\w+)");
    let mut matches = vec![];
    let err = matcher.try_find_iter(b"aa bb cc dd", |m| {
        if matches.is_empty() {
            matches.push(m);
            Ok(true)
        } else {
            Err(MyError)
        }
    }).unwrap().unwrap_err();
    assert_eq!(matches, vec![m(0, 5)]);
    assert_eq!(err, MyError);
}

#[test]
fn shortest_match() {
    let matcher = matcher(r"a+");
    // This tests that the default impl isn't doing anything smart, and simply
    // defers to `find`.
    assert_eq!(matcher.shortest_match(b"aaa").unwrap(), Some(3));
    // The actual underlying regex is smarter.
    assert_eq!(matcher.re.shortest_match(b"aaa"), Some(1));
}
|
||||||
|
|
||||||
|
// Capture metadata (count, name-to-index) and single-shot `captures`.
#[test]
fn captures() {
    let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
    assert_eq!(matcher.capture_count(), 3);
    assert_eq!(matcher.capture_index("a"), Some(1));
    assert_eq!(matcher.capture_index("b"), Some(2));
    assert_eq!(matcher.capture_index("nada"), None);

    let mut caps = matcher.new_captures().unwrap();
    assert!(matcher.captures(b" homer simpson ", &mut caps).unwrap());
    assert_eq!(caps.get(0), Some(m(1, 14)));
    assert_eq!(caps.get(1), Some(m(1, 6)));
    assert_eq!(caps.get(2), Some(m(7, 14)));
}

// `captures_iter` reuses one `caps` allocation across matches and stops when
// the callback returns false.
#[test]
fn captures_iter() {
    let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
    let mut caps = matcher.new_captures().unwrap();
    let mut matches = vec![];
    matcher.captures_iter(b"aa bb cc dd", &mut caps, |caps| {
        matches.push(caps.get(0).unwrap());
        matches.push(caps.get(1).unwrap());
        matches.push(caps.get(2).unwrap());
        true
    }).unwrap();
    assert_eq!(matches, vec![
        m(0, 5), m(0, 2), m(3, 5),
        m(6, 11), m(6, 8), m(9, 11),
    ]);

    // Test that captures_iter respects short circuiting.
    matches.clear();
    matcher.captures_iter(b"aa bb cc dd", &mut caps, |caps| {
        matches.push(caps.get(0).unwrap());
        matches.push(caps.get(1).unwrap());
        matches.push(caps.get(2).unwrap());
        false
    }).unwrap();
    assert_eq!(matches, vec![
        m(0, 5), m(0, 2), m(3, 5),
    ]);
}

// `try_captures_iter` propagates the callback's error and stops iterating.
#[test]
fn try_captures_iter() {
    #[derive(Clone, Debug, Eq, PartialEq)]
    struct MyError;

    let matcher = matcher(r"(?P<a>\w+)\s+(?P<b>\w+)");
    let mut caps = matcher.new_captures().unwrap();
    let mut matches = vec![];
    let err = matcher.try_captures_iter(b"aa bb cc dd", &mut caps, |caps| {
        if matches.is_empty() {
            matches.push(caps.get(0).unwrap());
            matches.push(caps.get(1).unwrap());
            matches.push(caps.get(2).unwrap());
            Ok(true)
        } else {
            Err(MyError)
        }
    }).unwrap().unwrap_err();
    assert_eq!(matches, vec![m(0, 5), m(0, 2), m(3, 5)]);
    assert_eq!(err, MyError);
}

// Test that our default impls for capturing are correct. Namely, when
// capturing isn't supported by the underlying matcher, then all of the
// various capturing related APIs fail fast.
#[test]
fn no_captures() {
    let matcher = matcher_no_caps(r"(?P<a>\w+)\s+(?P<b>\w+)");
    assert_eq!(matcher.capture_count(), 0);
    assert_eq!(matcher.capture_index("a"), None);
    assert_eq!(matcher.capture_index("b"), None);
    assert_eq!(matcher.capture_index("nada"), None);

    let mut caps = matcher.new_captures().unwrap();
    assert!(!matcher.captures(b"homer simpson", &mut caps).unwrap());

    let mut called = false;
    matcher.captures_iter(b"homer simpson", &mut caps, |_| {
        called = true;
        true
    }).unwrap();
    assert!(!called);
}

// `replace` substitutes the callback's output for each match, copying the
// unmatched text through, and stops replacing when the callback returns false.
#[test]
fn replace() {
    let matcher = matcher(r"(\w+)\s+(\w+)");
    let mut dst = vec![];
    matcher.replace(b"aa bb cc dd", &mut dst, |_, dst| {
        dst.push(b'z');
        true
    }).unwrap();
    assert_eq!(dst, b"z z");

    // Test that replacements respect short circuiting.
    dst.clear();
    matcher.replace(b"aa bb cc dd", &mut dst, |_, dst| {
        dst.push(b'z');
        false
    }).unwrap();
    assert_eq!(dst, b"z cc dd");
}

// `replace_with_captures` exposes the capture groups to the callback, here
// used with `interpolate` to swap the two groups.
#[test]
fn replace_with_captures() {
    let matcher = matcher(r"(\w+)\s+(\w+)");
    let haystack = b"aa bb cc dd";
    let mut caps = matcher.new_captures().unwrap();
    let mut dst = vec![];
    matcher.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| {
        caps.interpolate(
            |name| matcher.capture_index(name),
            haystack,
            b"$2 $1",
            dst,
        );
        true
    }).unwrap();
    assert_eq!(dst, b"bb aa dd cc");

    // Test that replacements respect short circuiting.
    dst.clear();
    matcher.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| {
        caps.interpolate(
            |name| matcher.capture_index(name),
            haystack,
            b"$2 $1",
            dst,
        );
        false
    }).unwrap();
    assert_eq!(dst, b"bb aa cc dd");
}
|
||||||
6
grep-matcher/tests/tests.rs
Normal file
6
grep-matcher/tests/tests.rs
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
extern crate grep_matcher;
|
||||||
|
extern crate regex;
|
||||||
|
|
||||||
|
mod util;
|
||||||
|
|
||||||
|
mod test_matcher;
|
||||||
104
grep-matcher/tests/util.rs
Normal file
104
grep-matcher/tests/util.rs
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::result;
|
||||||
|
|
||||||
|
use grep_matcher::{Captures, Match, Matcher, NoCaptures, NoError};
|
||||||
|
use regex::bytes::{CaptureLocations, Regex};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct RegexMatcher {
|
||||||
|
pub re: Regex,
|
||||||
|
pub names: HashMap<String, usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexMatcher {
|
||||||
|
pub fn new(re: Regex) -> RegexMatcher {
|
||||||
|
let mut names = HashMap::new();
|
||||||
|
for (i, optional_name) in re.capture_names().enumerate() {
|
||||||
|
if let Some(name) = optional_name {
|
||||||
|
names.insert(name.to_string(), i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
RegexMatcher {
|
||||||
|
re: re,
|
||||||
|
names: names,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type Result<T> = result::Result<T, NoError>;
|
||||||
|
|
||||||
|
impl Matcher for RegexMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>> {
|
||||||
|
Ok(self.re
|
||||||
|
.find_at(haystack, at)
|
||||||
|
.map(|m| Match::new(m.start(), m.end())))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures> {
|
||||||
|
Ok(RegexCaptures(self.re.capture_locations()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool> {
|
||||||
|
Ok(self.re.captures_read_at(&mut caps.0, haystack, at).is_some())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
self.re.captures_len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
self.names.get(name).map(|i| *i)
|
||||||
|
}
|
||||||
|
|
||||||
|
// We purposely don't implement any other methods, so that we test the
|
||||||
|
// default impls. The "real" Regex impl for Matcher provides a few more
|
||||||
|
// impls. e.g., Its `find_iter` impl is faster than what we can do here,
|
||||||
|
// since the regex crate avoids synchronization overhead.
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct RegexMatcherNoCaps(pub Regex);
|
||||||
|
|
||||||
|
impl Matcher for RegexMatcherNoCaps {
|
||||||
|
type Captures = NoCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>> {
|
||||||
|
Ok(self.0
|
||||||
|
.find_at(haystack, at)
|
||||||
|
.map(|m| Match::new(m.start(), m.end())))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<NoCaptures> {
|
||||||
|
Ok(NoCaptures::new())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct RegexCaptures(CaptureLocations);
|
||||||
|
|
||||||
|
impl Captures for RegexCaptures {
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.0.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get(&self, i: usize) -> Option<Match> {
|
||||||
|
self.0.pos(i).map(|(s, e)| Match::new(s, e))
|
||||||
|
}
|
||||||
|
}
|
||||||
17
grep-pcre2/Cargo.toml
Normal file
17
grep-pcre2/Cargo.toml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-pcre2"
|
||||||
|
version = "0.1.2" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
Use PCRE2 with the 'grep' crate.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-pcre2"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["regex", "grep", "pcre", "backreference", "look"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
|
pcre2 = "0.1.1"
|
||||||
21
grep-pcre2/LICENSE-MIT
Normal file
21
grep-pcre2/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
39
grep-pcre2/README.md
Normal file
39
grep-pcre2/README.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
grep-pcre2
|
||||||
|
----------
|
||||||
|
The `grep-pcre2` crate provides an implementation of the `Matcher` trait from
|
||||||
|
the `grep-matcher` crate. This implementation permits PCRE2 to be used in the
|
||||||
|
`grep` crate for fast line oriented searching.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-pcre2)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-pcre2](https://docs.rs/grep-pcre2)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
If you're looking to just use PCRE2 from Rust, then you probably want the
|
||||||
|
[`pcre2`](https://docs.rs/pcre2)
|
||||||
|
crate, which provide high level safe bindings to PCRE2.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-pcre2 = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_pcre2;
|
||||||
|
```
|
||||||
24
grep-pcre2/UNLICENSE
Normal file
24
grep-pcre2/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
59
grep-pcre2/src/error.rs
Normal file
59
grep-pcre2/src/error.rs
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
/// An error that can occur in this crate.
|
||||||
|
///
|
||||||
|
/// Generally, this error corresponds to problems building a regular
|
||||||
|
/// expression, whether it's in parsing, compilation or a problem with
|
||||||
|
/// guaranteeing a configured optimization.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Error {
|
||||||
|
kind: ErrorKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Error {
|
||||||
|
pub(crate) fn regex<E: error::Error>(err: E) -> Error {
|
||||||
|
Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the kind of this error.
|
||||||
|
pub fn kind(&self) -> &ErrorKind {
|
||||||
|
&self.kind
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The kind of an error that can occur.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub enum ErrorKind {
|
||||||
|
/// An error that occurred as a result of parsing a regular expression.
|
||||||
|
/// This can be a syntax error or an error that results from attempting to
|
||||||
|
/// compile a regular expression that is too big.
|
||||||
|
///
|
||||||
|
/// The string here is the underlying error converted to a string.
|
||||||
|
Regex(String),
|
||||||
|
/// Hints that destructuring should not be exhaustive.
|
||||||
|
///
|
||||||
|
/// This enum may grow additional variants, so this makes sure clients
|
||||||
|
/// don't count on exhaustive matching. (Otherwise, adding a new variant
|
||||||
|
/// could break existing code.)
|
||||||
|
#[doc(hidden)]
|
||||||
|
__Nonexhaustive,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl error::Error for Error {
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
match self.kind {
|
||||||
|
ErrorKind::Regex(_) => "regex error",
|
||||||
|
ErrorKind::__Nonexhaustive => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for Error {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
match self.kind {
|
||||||
|
ErrorKind::Regex(ref s) => write!(f, "{}", s),
|
||||||
|
ErrorKind::__Nonexhaustive => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
15
grep-pcre2/src/lib.rs
Normal file
15
grep-pcre2/src/lib.rs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
/*!
|
||||||
|
An implementation of `grep-matcher`'s `Matcher` trait for
|
||||||
|
[PCRE2](https://www.pcre.org/).
|
||||||
|
*/
|
||||||
|
|
||||||
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
|
extern crate grep_matcher;
|
||||||
|
extern crate pcre2;
|
||||||
|
|
||||||
|
pub use error::{Error, ErrorKind};
|
||||||
|
pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
|
||||||
|
|
||||||
|
mod error;
|
||||||
|
mod matcher;
|
||||||
443
grep-pcre2/src/matcher.rs
Normal file
443
grep-pcre2/src/matcher.rs
Normal file
@@ -0,0 +1,443 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use grep_matcher::{Captures, Match, Matcher};
|
||||||
|
use pcre2::bytes::{CaptureLocations, Regex, RegexBuilder};
|
||||||
|
|
||||||
|
use error::Error;
|
||||||
|
|
||||||
|
/// A builder for configuring the compilation of a PCRE2 regex.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct RegexMatcherBuilder {
|
||||||
|
builder: RegexBuilder,
|
||||||
|
case_smart: bool,
|
||||||
|
word: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexMatcherBuilder {
|
||||||
|
/// Create a new matcher builder with a default configuration.
|
||||||
|
pub fn new() -> RegexMatcherBuilder {
|
||||||
|
RegexMatcherBuilder {
|
||||||
|
builder: RegexBuilder::new(),
|
||||||
|
case_smart: false,
|
||||||
|
word: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compile the given pattern into a PCRE matcher using the current
|
||||||
|
/// configuration.
|
||||||
|
///
|
||||||
|
/// If there was a problem compiling the pattern, then an error is
|
||||||
|
/// returned.
|
||||||
|
pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
|
||||||
|
let mut builder = self.builder.clone();
|
||||||
|
if self.case_smart && !has_uppercase_literal(pattern) {
|
||||||
|
builder.caseless(true);
|
||||||
|
}
|
||||||
|
let res =
|
||||||
|
if self.word {
|
||||||
|
let pattern = format!(r"(?<!\w)(?:{})(?!\w)", pattern);
|
||||||
|
builder.build(&pattern)
|
||||||
|
} else {
|
||||||
|
builder.build(pattern)
|
||||||
|
};
|
||||||
|
res.map_err(Error::regex).map(|regex| {
|
||||||
|
let mut names = HashMap::new();
|
||||||
|
for (i, name) in regex.capture_names().iter().enumerate() {
|
||||||
|
if let Some(ref name) = *name {
|
||||||
|
names.insert(name.to_string(), i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
RegexMatcher { regex, names }
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enables case insensitive matching.
|
||||||
|
///
|
||||||
|
/// If the `utf` option is also set, then Unicode case folding is used
|
||||||
|
/// to determine case insensitivity. When the `utf` option is not set,
|
||||||
|
/// then only standard ASCII case insensitivity is considered.
|
||||||
|
///
|
||||||
|
/// This option corresponds to the `i` flag.
|
||||||
|
pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.caseless(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to enable "smart case" or not.
|
||||||
|
///
|
||||||
|
/// When smart case is enabled, the builder will automatically enable
|
||||||
|
/// case insensitive matching based on how the pattern is written. Namely,
|
||||||
|
/// case insensitive mode is enabled when both of the following things
|
||||||
|
/// are believed to be true:
|
||||||
|
///
|
||||||
|
/// 1. The pattern contains at least one literal character. For example,
|
||||||
|
/// `a\w` contains a literal (`a`) but `\w` does not.
|
||||||
|
/// 2. Of the literals in the pattern, none of them are considered to be
|
||||||
|
/// uppercase according to Unicode. For example, `foo\pL` has no
|
||||||
|
/// uppercase literals but `Foo\pL` does.
|
||||||
|
///
|
||||||
|
/// Note that the implementation of this is not perfect. Namely, `\p{Ll}`
|
||||||
|
/// will prevent case insensitive matching even though it is part of a meta
|
||||||
|
/// sequence. This bug will probably never be fixed.
|
||||||
|
pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.case_smart = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enables "dot all" matching.
|
||||||
|
///
|
||||||
|
/// When enabled, the `.` metacharacter in the pattern matches any
|
||||||
|
/// character, include `\n`. When disabled (the default), `.` will match
|
||||||
|
/// any character except for `\n`.
|
||||||
|
///
|
||||||
|
/// This option corresponds to the `s` flag.
|
||||||
|
pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.dotall(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable "extended" mode in the pattern, where whitespace is ignored.
|
||||||
|
///
|
||||||
|
/// This option corresponds to the `x` flag.
|
||||||
|
pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.extended(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable multiline matching mode.
|
||||||
|
///
|
||||||
|
/// When enabled, the `^` and `$` anchors will match both at the beginning
|
||||||
|
/// and end of a subject string, in addition to matching at the start of
|
||||||
|
/// a line and the end of a line. When disabled, the `^` and `$` anchors
|
||||||
|
/// will only match at the beginning and end of a subject string.
|
||||||
|
///
|
||||||
|
/// This option corresponds to the `m` flag.
|
||||||
|
pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.multi_line(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable matching of CRLF as a line terminator.
|
||||||
|
///
|
||||||
|
/// When enabled, anchors such as `^` and `$` will match any of the
|
||||||
|
/// following as a line terminator: `\r`, `\n` or `\r\n`.
|
||||||
|
///
|
||||||
|
/// This is disabled by default, in which case, only `\n` is recognized as
|
||||||
|
/// a line terminator.
|
||||||
|
pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.crlf(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Require that all matches occur on word boundaries.
|
||||||
|
///
|
||||||
|
/// Enabling this option is subtly different than putting `\b` assertions
|
||||||
|
/// on both sides of your pattern. In particular, a `\b` assertion requires
|
||||||
|
/// that one side of it match a word character while the other match a
|
||||||
|
/// non-word character. This option, in contrast, merely requires that
|
||||||
|
/// one side match a non-word character.
|
||||||
|
///
|
||||||
|
/// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
|
||||||
|
/// word character. However, `-2` with this `word` option enabled will
|
||||||
|
/// match the `-2` in `foo -2 bar`.
|
||||||
|
pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.word = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable Unicode matching mode.
|
||||||
|
///
|
||||||
|
/// When enabled, the following patterns become Unicode aware: `\b`, `\B`,
|
||||||
|
/// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`.
|
||||||
|
///
|
||||||
|
/// When set, this implies UTF matching mode. It is not possible to enable
|
||||||
|
/// Unicode matching mode without enabling UTF matching mode.
|
||||||
|
///
|
||||||
|
/// This is disabled by default.
|
||||||
|
pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.ucp(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable UTF matching mode.
|
||||||
|
///
|
||||||
|
/// When enabled, characters are treated as sequences of code units that
|
||||||
|
/// make up a single codepoint instead of as single bytes. For example,
|
||||||
|
/// this will cause `.` to match any single UTF-8 encoded codepoint, where
|
||||||
|
/// as when this is disabled, `.` will any single byte (except for `\n` in
|
||||||
|
/// both cases, unless "dot all" mode is enabled).
|
||||||
|
///
|
||||||
|
/// Note that when UTF matching mode is enabled, every search performed
|
||||||
|
/// will do a UTF-8 validation check, which can impact performance. The
|
||||||
|
/// UTF-8 check can be disabled via the `disable_utf_check` option, but it
|
||||||
|
/// is undefined behavior to enable UTF matching mode and search invalid
|
||||||
|
/// UTF-8.
|
||||||
|
///
|
||||||
|
/// This is disabled by default.
|
||||||
|
pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.utf(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// When UTF matching mode is enabled, this will disable the UTF checking
|
||||||
|
/// that PCRE2 will normally perform automatically. If UTF matching mode
|
||||||
|
/// is not enabled, then this has no effect.
|
||||||
|
///
|
||||||
|
/// UTF checking is enabled by default when UTF matching mode is enabled.
|
||||||
|
/// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2
|
||||||
|
/// will return an error if you attempt to search a subject string that is
|
||||||
|
/// not valid UTF-8.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// It is undefined behavior to disable the UTF check in UTF matching mode
|
||||||
|
/// and search a subject string that is not valid UTF-8. When the UTF check
|
||||||
|
/// is disabled, callers must guarantee that the subject string is valid
|
||||||
|
/// UTF-8.
|
||||||
|
pub unsafe fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.disable_utf_check();
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable PCRE2's JIT and return an error if it's not available.
|
||||||
|
///
|
||||||
|
/// This generally speeds up matching quite a bit. The downside is that it
|
||||||
|
/// can increase the time it takes to compile a pattern.
|
||||||
|
///
|
||||||
|
/// If the JIT isn't available or if JIT compilation returns an error, then
|
||||||
|
/// regex compilation will fail with the corresponding error.
|
||||||
|
///
|
||||||
|
/// This is disabled by default, and always overrides `jit_if_available`.
|
||||||
|
pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.jit(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Enable PCRE2's JIT if it's available.
|
||||||
|
///
|
||||||
|
/// This generally speeds up matching quite a bit. The downside is that it
|
||||||
|
/// can increase the time it takes to compile a pattern.
|
||||||
|
///
|
||||||
|
/// If the JIT isn't available or if JIT compilation returns an error,
|
||||||
|
/// then a debug message with the error will be emitted and the regex will
|
||||||
|
/// otherwise silently fall back to non-JIT matching.
|
||||||
|
///
|
||||||
|
/// This is disabled by default, and always overrides `jit`.
|
||||||
|
pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
|
||||||
|
self.builder.jit_if_available(yes);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An implementation of the `Matcher` trait using PCRE2.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct RegexMatcher {
|
||||||
|
regex: Regex,
|
||||||
|
names: HashMap<String, usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexMatcher {
|
||||||
|
/// Create a new matcher from the given pattern using the default
|
||||||
|
/// configuration.
|
||||||
|
pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
|
||||||
|
RegexMatcherBuilder::new().build(pattern)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher for RegexMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = Error;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, Error> {
|
||||||
|
Ok(self.regex
|
||||||
|
.find_at(haystack, at)
|
||||||
|
.map_err(Error::regex)?
|
||||||
|
.map(|m| Match::new(m.start(), m.end())))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures, Error> {
|
||||||
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
self.regex.captures_len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
self.names.get(name).map(|i| *i)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_find_iter<F, E>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
mut matched: F,
|
||||||
|
) -> Result<Result<(), E>, Error>
|
||||||
|
where F: FnMut(Match) -> Result<bool, E>
|
||||||
|
{
|
||||||
|
for result in self.regex.find_iter(haystack) {
|
||||||
|
let m = result.map_err(Error::regex)?;
|
||||||
|
match matched(Match::new(m.start(), m.end())) {
|
||||||
|
Ok(true) => continue,
|
||||||
|
Ok(false) => return Ok(Ok(())),
|
||||||
|
Err(err) => return Ok(Err(err)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Ok(()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, Error> {
|
||||||
|
Ok(self.regex
|
||||||
|
.captures_read_at(&mut caps.locs, haystack, at)
|
||||||
|
.map_err(Error::regex)?
|
||||||
|
.is_some())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents the match offsets of each capturing group in a match.
|
||||||
|
///
|
||||||
|
/// The first, or `0`th capture group, always corresponds to the entire match
|
||||||
|
/// and is guaranteed to be present when a match occurs. The next capture
|
||||||
|
/// group, at index `1`, corresponds to the first capturing group in the regex,
|
||||||
|
/// ordered by the position at which the left opening parenthesis occurs.
|
||||||
|
///
|
||||||
|
/// Note that not all capturing groups are guaranteed to be present in a match.
|
||||||
|
/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
|
||||||
|
/// or `bar` will ever be set in any given match.
|
||||||
|
///
|
||||||
|
/// In order to access a capture group by name, you'll need to first find the
|
||||||
|
/// index of the group using the corresponding matcher's `capture_index`
|
||||||
|
/// method, and then use that index with `RegexCaptures::get`.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct RegexCaptures {
|
||||||
|
/// Where the locations are stored.
|
||||||
|
locs: CaptureLocations,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Captures for RegexCaptures {
|
||||||
|
fn len(&self) -> usize {
|
||||||
|
self.locs.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get(&self, i: usize) -> Option<Match> {
|
||||||
|
self.locs.get(i).map(|(s, e)| Match::new(s, e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RegexCaptures {
|
||||||
|
pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
|
||||||
|
RegexCaptures { locs }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Determine whether the pattern contains an uppercase character which should
|
||||||
|
/// negate the effect of the smart-case option.
|
||||||
|
///
|
||||||
|
/// Ideally we would be able to check the AST in order to correctly handle
|
||||||
|
/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly
|
||||||
|
/// cased), but PCRE doesn't expose enough details for that kind of analysis.
|
||||||
|
/// For now, our 'good enough' solution is to simply perform a semi-naïve
|
||||||
|
/// scan of the input pattern and ignore all characters following a '\'. The
|
||||||
|
/// This at least lets us support the most common cases, like 'foo\w' and
|
||||||
|
/// 'foo\S', in an intuitive manner.
|
||||||
|
fn has_uppercase_literal(pattern: &str) -> bool {
|
||||||
|
let mut chars = pattern.chars();
|
||||||
|
while let Some(c) = chars.next() {
|
||||||
|
if c == '\\' {
|
||||||
|
chars.next();
|
||||||
|
} else if c.is_uppercase() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use grep_matcher::{LineMatchKind, Matcher};
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
// Test that enabling word matches does the right thing and demonstrate
|
||||||
|
// the difference between it and surrounding the regex in `\b`.
|
||||||
|
#[test]
|
||||||
|
fn word() {
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.word(true)
|
||||||
|
.build(r"-2")
|
||||||
|
.unwrap();
|
||||||
|
assert!(matcher.is_match(b"abc -2 foo").unwrap());
|
||||||
|
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.word(false)
|
||||||
|
.build(r"\b-2\b")
|
||||||
|
.unwrap();
|
||||||
|
assert!(!matcher.is_match(b"abc -2 foo").unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that enabling CRLF permits `$` to match at the end of a line.
|
||||||
|
#[test]
|
||||||
|
fn line_terminator_crlf() {
|
||||||
|
// Test normal use of `$` with a `\n` line terminator.
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.multi_line(true)
|
||||||
|
.build(r"abc$")
|
||||||
|
.unwrap();
|
||||||
|
assert!(matcher.is_match(b"abc\n").unwrap());
|
||||||
|
|
||||||
|
// Test that `$` doesn't match at `\r\n` boundary normally.
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.multi_line(true)
|
||||||
|
.build(r"abc$")
|
||||||
|
.unwrap();
|
||||||
|
assert!(!matcher.is_match(b"abc\r\n").unwrap());
|
||||||
|
|
||||||
|
// Now check the CRLF handling.
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.multi_line(true)
|
||||||
|
.crlf(true)
|
||||||
|
.build(r"abc$")
|
||||||
|
.unwrap();
|
||||||
|
assert!(matcher.is_match(b"abc\r\n").unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that smart case works.
|
||||||
|
#[test]
|
||||||
|
fn case_smart() {
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.case_smart(true)
|
||||||
|
.build(r"abc")
|
||||||
|
.unwrap();
|
||||||
|
assert!(matcher.is_match(b"ABC").unwrap());
|
||||||
|
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.case_smart(true)
|
||||||
|
.build(r"aBc")
|
||||||
|
.unwrap();
|
||||||
|
assert!(!matcher.is_match(b"ABC").unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test that finding candidate lines works as expected.
|
||||||
|
#[test]
|
||||||
|
fn candidate_lines() {
|
||||||
|
fn is_confirmed(m: LineMatchKind) -> bool {
|
||||||
|
match m {
|
||||||
|
LineMatchKind::Confirmed(_) => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let matcher = RegexMatcherBuilder::new()
|
||||||
|
.build(r"\wfoo\s")
|
||||||
|
.unwrap();
|
||||||
|
let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
|
||||||
|
assert!(is_confirmed(m));
|
||||||
|
}
|
||||||
|
}
|
||||||
30
grep-printer/Cargo.toml
Normal file
30
grep-printer/Cargo.toml
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-printer"
|
||||||
|
version = "0.1.1" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
An implementation of the grep crate's Sink trait that provides standard
|
||||||
|
printing of search results, similar to grep itself.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-printer"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["grep", "pattern", "print", "printer", "sink"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["serde1"]
|
||||||
|
serde1 = ["base64", "serde", "serde_derive", "serde_json"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
base64 = { version = "0.10.0", optional = true }
|
||||||
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
|
grep-searcher = { version = "0.1.1", path = "../grep-searcher" }
|
||||||
|
termcolor = "1.0.4"
|
||||||
|
serde = { version = "1.0.77", optional = true }
|
||||||
|
serde_derive = { version = "1.0.77", optional = true }
|
||||||
|
serde_json = { version = "1.0.27", optional = true }
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
grep-regex = { version = "0.1.1", path = "../grep-regex" }
|
||||||
21
grep-printer/LICENSE-MIT
Normal file
21
grep-printer/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
35
grep-printer/README.md
Normal file
35
grep-printer/README.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
grep-printer
|
||||||
|
------------
|
||||||
|
Print results from line oriented searching in a human readable, aggregate or
|
||||||
|
JSON Lines format.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-printer)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-printer](https://docs.rs/grep-printer)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-printer = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_printer;
|
||||||
|
```
|
||||||
24
grep-printer/UNLICENSE
Normal file
24
grep-printer/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
394
grep-printer/src/color.rs
Normal file
394
grep-printer/src/color.rs
Normal file
@@ -0,0 +1,394 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::fmt;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use termcolor::{Color, ColorSpec, ParseColorError};
|
||||||
|
|
||||||
|
/// Returns a default set of color specifications.
|
||||||
|
///
|
||||||
|
/// This may change over time, but the color choices are meant to be fairly
|
||||||
|
/// conservative that work across terminal themes.
|
||||||
|
///
|
||||||
|
/// Additional color specifications can be added to the list returned. More
|
||||||
|
/// recently added specifications override previously added specifications.
|
||||||
|
pub fn default_color_specs() -> Vec<UserColorSpec> {
    vec![
        // Paths get a platform-dependent color.
        // NOTE(review): presumably cyan is used on Windows because the
        // legacy console palette renders magenta poorly — confirm.
        #[cfg(unix)]
        "path:fg:magenta".parse().unwrap(),
        #[cfg(windows)]
        "path:fg:cyan".parse().unwrap(),
        "line:fg:green".parse().unwrap(),
        // Later specs override earlier ones, so the bold style is layered
        // on top of the red foreground for matches.
        "match:fg:red".parse().unwrap(),
        "match:style:bold".parse().unwrap(),
    ]
}
||||||
|
|
||||||
|
/// An error that can occur when parsing color specifications.
|
||||||
|
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ColorError {
    /// This occurs when an unrecognized output type is used.
    UnrecognizedOutType(String),
    /// This occurs when an unrecognized spec type is used.
    UnrecognizedSpecType(String),
    /// This occurs when an unrecognized color name is used.
    /// The first field is the invalid input; the second is the error
    /// message produced by `termcolor` when parsing it.
    UnrecognizedColor(String, String),
    /// This occurs when an unrecognized style attribute is used.
    UnrecognizedStyle(String),
    /// This occurs when the format of a color specification is invalid.
    InvalidFormat(String),
}
|
||||||
|
|
||||||
|
impl error::Error for ColorError {
    // NOTE(review): `Error::description` is soft-deprecated in favor of
    // `Display`; it is kept here for compatibility with older consumers.
    // The short strings below mirror the variant-specific `Display`
    // messages, minus the offending input.
    fn description(&self) -> &str {
        match *self {
            ColorError::UnrecognizedOutType(_) => "unrecognized output type",
            ColorError::UnrecognizedSpecType(_) => "unrecognized spec type",
            ColorError::UnrecognizedColor(_, _) => "unrecognized color name",
            ColorError::UnrecognizedStyle(_) => "unrecognized style attribute",
            ColorError::InvalidFormat(_) => "invalid color spec",
        }
    }
}
|
||||||
|
|
||||||
|
impl ColorError {
|
||||||
|
fn from_parse_error(err: ParseColorError) -> ColorError {
|
||||||
|
ColorError::UnrecognizedColor(
|
||||||
|
err.invalid().to_string(),
|
||||||
|
err.to_string(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for ColorError {
    // Human-readable messages for each variant. `UnrecognizedColor`
    // simply forwards the message that `termcolor` produced; the other
    // variants enumerate the valid choices for the user.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            ColorError::UnrecognizedOutType(ref name) => {
                write!(
                    f,
                    "unrecognized output type '{}'. Choose from: \
                     path, line, column, match.",
                    name,
                )
            }
            ColorError::UnrecognizedSpecType(ref name) => {
                write!(
                    f,
                    "unrecognized spec type '{}'. Choose from: \
                     fg, bg, style, none.",
                    name,
                )
            }
            ColorError::UnrecognizedColor(_, ref msg) => {
                write!(f, "{}", msg)
            }
            ColorError::UnrecognizedStyle(ref name) => {
                write!(
                    f,
                    "unrecognized style attribute '{}'. Choose from: \
                     nobold, bold, nointense, intense, nounderline, \
                     underline.",
                    name,
                )
            }
            ColorError::InvalidFormat(ref original) => {
                write!(
                    f,
                    "invalid color spec format: '{}'. Valid format \
                     is '(path|line|column|match):(fg|bg|style):(value)'.",
                    original,
                )
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// A merged set of color specifications.
|
||||||
|
///
|
||||||
|
/// This set of color specifications represents the various color types that
|
||||||
|
/// are supported by the printers in this crate. A set of color specifications
|
||||||
|
/// can be created from a sequence of
|
||||||
|
/// [`UserColorSpec`s](struct.UserColorSpec.html).
|
||||||
|
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct ColorSpecs {
    // Color applied to file paths.
    path: ColorSpec,
    // Color applied to line numbers.
    line: ColorSpec,
    // Color applied to column numbers.
    column: ColorSpec,
    // Color applied to matched text. Named `matched` (not `match`)
    // because `match` is a keyword.
    matched: ColorSpec,
}
|
||||||
|
|
||||||
|
/// A single color specification provided by the user.
|
||||||
|
///
|
||||||
|
/// ## Format
|
||||||
|
///
|
||||||
|
/// The format of a `Spec` is a triple: `{type}:{attribute}:{value}`. Each
|
||||||
|
/// component is defined as follows:
|
||||||
|
///
|
||||||
|
/// * `{type}` can be one of `path`, `line`, `column` or `match`.
|
||||||
|
/// * `{attribute}` can be one of `fg`, `bg` or `style`. `{attribute}` may also
|
||||||
|
/// be the special value `none`, in which case, `{value}` can be omitted.
|
||||||
|
/// * `{value}` is either a color name (for `fg`/`bg`) or a style instruction.
|
||||||
|
///
|
||||||
|
/// `{type}` controls which part of the output should be styled.
|
||||||
|
///
|
||||||
|
/// When `{attribute}` is `none`, then this should cause any existing style
|
||||||
|
/// settings to be cleared for the specified `type`.
|
||||||
|
///
|
||||||
|
/// `{value}` should be a color when `{attribute}` is `fg` or `bg`, or it
|
||||||
|
/// should be a style instruction when `{attribute}` is `style`. When
|
||||||
|
/// `{attribute}` is `none`, `{value}` must be omitted.
|
||||||
|
///
|
||||||
|
/// Valid colors are `black`, `blue`, `green`, `red`, `cyan`, `magenta`,
|
||||||
|
/// `yellow`, `white`. Extended colors can also be specified, and are formatted
|
||||||
|
/// as `x` (for 256-bit colors) or `x,x,x` (for 24-bit true color), where
|
||||||
|
/// `x` is a number between 0 and 255 inclusive. `x` may be given as a normal
|
||||||
|
/// decimal number or a hexadecimal number, where the latter is prefixed by
|
||||||
|
/// `0x`.
|
||||||
|
///
|
||||||
|
/// Valid style instructions are `nobold`, `bold`, `intense`, `nointense`,
|
||||||
|
/// `underline`, `nounderline`.
|
||||||
|
///
|
||||||
|
/// ## Example
|
||||||
|
///
|
||||||
|
/// The standard way to build a `UserColorSpec` is to parse it from a string.
|
||||||
|
/// Once multiple `UserColorSpec`s have been constructed, they can be provided
|
||||||
|
/// to the standard printer where they will automatically be applied to the
|
||||||
|
/// output.
|
||||||
|
///
|
||||||
|
/// A `UserColorSpec` can also be converted to a `termcolor::ColorSpec`:
|
||||||
|
///
|
||||||
|
/// ```rust
|
||||||
|
/// extern crate grep_printer;
|
||||||
|
/// extern crate termcolor;
|
||||||
|
///
|
||||||
|
/// # fn main() {
|
||||||
|
/// use termcolor::{Color, ColorSpec};
|
||||||
|
/// use grep_printer::UserColorSpec;
|
||||||
|
///
|
||||||
|
/// let user_spec1: UserColorSpec = "path:fg:blue".parse().unwrap();
|
||||||
|
/// let user_spec2: UserColorSpec = "match:bg:0xff,0x7f,0x00".parse().unwrap();
|
||||||
|
///
|
||||||
|
/// let spec1 = user_spec1.to_color_spec();
|
||||||
|
/// let spec2 = user_spec2.to_color_spec();
|
||||||
|
///
|
||||||
|
/// assert_eq!(spec1.fg(), Some(&Color::Blue));
|
||||||
|
/// assert_eq!(spec2.bg(), Some(&Color::Rgb(0xFF, 0x7F, 0x00)));
|
||||||
|
/// # }
|
||||||
|
/// ```
|
||||||
|
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct UserColorSpec {
    // Which part of the output this spec applies to
    // (path, line, column or match).
    ty: OutType,
    // The parsed setting: a fg/bg color, a style toggle, or `None`
    // to clear all previous settings for `ty`.
    value: SpecValue,
}
|
||||||
|
|
||||||
|
impl UserColorSpec {
|
||||||
|
/// Convert this user provided color specification to a specification that
|
||||||
|
/// can be used with `termcolor`. This drops the type of this specification
|
||||||
|
/// (where the type indicates where the color is applied in the standard
|
||||||
|
/// printer, e.g., to the file path or the line numbers, etc.).
|
||||||
|
pub fn to_color_spec(&self) -> ColorSpec {
|
||||||
|
let mut spec = ColorSpec::default();
|
||||||
|
self.value.merge_into(&mut spec);
|
||||||
|
spec
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The actual value given by the specification.
#[derive(Clone, Debug, Eq, PartialEq)]
enum SpecValue {
    /// Clear all previously accumulated settings (`{type}:none`).
    None,
    /// A foreground color.
    Fg(Color),
    /// A background color.
    Bg(Color),
    /// A style toggle such as bold/nobold.
    Style(Style),
}
|
||||||
|
|
||||||
|
/// The set of configurable portions of ripgrep's output.
#[derive(Clone, Debug, Eq, PartialEq)]
enum OutType {
    Path,
    Line,
    Column,
    Match,
}
|
||||||
|
|
||||||
|
/// The specification type: the `{attribute}` piece of a color spec.
#[derive(Clone, Debug, Eq, PartialEq)]
enum SpecType {
    Fg,
    Bg,
    Style,
    /// Clears prior settings; takes no `{value}` piece.
    None,
}
|
||||||
|
|
||||||
|
/// The set of available styles for use in the terminal.
/// Each style has an explicit negation so a later spec can undo
/// an earlier one.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Style {
    Bold,
    NoBold,
    Intense,
    NoIntense,
    Underline,
    NoUnderline
}
|
||||||
|
|
||||||
|
impl ColorSpecs {
    /// Create color specifications from a list of user supplied
    /// specifications.
    ///
    /// Specs are applied in order, so a later spec for the same output
    /// type overrides (or, for styles, layers on top of) an earlier one.
    pub fn new(specs: &[UserColorSpec]) -> ColorSpecs {
        let mut merged = ColorSpecs::default();
        for spec in specs {
            match spec.ty {
                OutType::Path => spec.merge_into(&mut merged.path),
                OutType::Line => spec.merge_into(&mut merged.line),
                OutType::Column => spec.merge_into(&mut merged.column),
                OutType::Match => spec.merge_into(&mut merged.matched),
            }
        }
        merged
    }

    /// Create a default set of specifications that have color.
    ///
    /// This is distinct from `ColorSpecs`'s `Default` implementation in that
    /// this provides a set of default color choices, where as the `Default`
    /// implementation provides no color choices.
    pub fn default_with_color() -> ColorSpecs {
        ColorSpecs::new(&default_color_specs())
    }

    /// Return the color specification for coloring file paths.
    pub fn path(&self) -> &ColorSpec {
        &self.path
    }

    /// Return the color specification for coloring line numbers.
    pub fn line(&self) -> &ColorSpec {
        &self.line
    }

    /// Return the color specification for coloring column numbers.
    pub fn column(&self) -> &ColorSpec {
        &self.column
    }

    /// Return the color specification for coloring matched text.
    pub fn matched(&self) -> &ColorSpec {
        &self.matched
    }
}
|
||||||
|
|
||||||
|
impl UserColorSpec {
|
||||||
|
/// Merge this spec into the given color specification.
|
||||||
|
fn merge_into(&self, cspec: &mut ColorSpec) {
|
||||||
|
self.value.merge_into(cspec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SpecValue {
    /// Merge this spec value into the given color specification.
    fn merge_into(&self, cspec: &mut ColorSpec) {
        match *self {
            // `none` wipes every previously accumulated setting.
            SpecValue::None => cspec.clear(),
            SpecValue::Fg(ref color) => { cspec.set_fg(Some(color.clone())); }
            SpecValue::Bg(ref color) => { cspec.set_bg(Some(color.clone())); }
            // Styles toggle individual flags, leaving colors untouched.
            SpecValue::Style(ref style) => {
                match *style {
                    Style::Bold => { cspec.set_bold(true); }
                    Style::NoBold => { cspec.set_bold(false); }
                    Style::Intense => { cspec.set_intense(true); }
                    Style::NoIntense => { cspec.set_intense(false); }
                    Style::Underline => { cspec.set_underline(true); }
                    Style::NoUnderline => { cspec.set_underline(false); }
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
impl FromStr for UserColorSpec {
    type Err = ColorError;

    /// Parse a spec of the form `{type}:{attribute}` or
    /// `{type}:{attribute}:{value}`.
    fn from_str(s: &str) -> Result<UserColorSpec, ColorError> {
        let pieces: Vec<&str> = s.split(':').collect();
        // Exactly two or three colon-separated pieces are accepted.
        if pieces.len() <= 1 || pieces.len() > 3 {
            return Err(ColorError::InvalidFormat(s.to_string()));
        }
        let otype: OutType = pieces[0].parse()?;
        match pieces[1].parse()? {
            SpecType::None => {
                // NOTE(review): a third piece (e.g. `match:none:foo`) is
                // silently ignored here rather than rejected — confirm
                // whether that leniency is intended.
                Ok(UserColorSpec {
                    ty: otype,
                    value: SpecValue::None,
                })
            }
            SpecType::Style => {
                // `style` requires an explicit value piece.
                if pieces.len() < 3 {
                    return Err(ColorError::InvalidFormat(s.to_string()));
                }
                let style: Style = pieces[2].parse()?;
                Ok(UserColorSpec { ty: otype, value: SpecValue::Style(style) })
            }
            SpecType::Fg => {
                if pieces.len() < 3 {
                    return Err(ColorError::InvalidFormat(s.to_string()));
                }
                // Color parsing is delegated to `termcolor`; its error is
                // wrapped so the offending input is preserved.
                let color: Color = pieces[2]
                    .parse()
                    .map_err(ColorError::from_parse_error)?;
                Ok(UserColorSpec { ty: otype, value: SpecValue::Fg(color) })
            }
            SpecType::Bg => {
                if pieces.len() < 3 {
                    return Err(ColorError::InvalidFormat(s.to_string()));
                }
                let color: Color = pieces[2]
                    .parse()
                    .map_err(ColorError::from_parse_error)?;
                Ok(UserColorSpec { ty: otype, value: SpecValue::Bg(color) })
            }
        }
    }
}
|
||||||
|
|
||||||
|
impl FromStr for OutType {
|
||||||
|
type Err = ColorError;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<OutType, ColorError> {
|
||||||
|
match &*s.to_lowercase() {
|
||||||
|
"path" => Ok(OutType::Path),
|
||||||
|
"line" => Ok(OutType::Line),
|
||||||
|
"column" => Ok(OutType::Column),
|
||||||
|
"match" => Ok(OutType::Match),
|
||||||
|
_ => Err(ColorError::UnrecognizedOutType(s.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for SpecType {
|
||||||
|
type Err = ColorError;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<SpecType, ColorError> {
|
||||||
|
match &*s.to_lowercase() {
|
||||||
|
"fg" => Ok(SpecType::Fg),
|
||||||
|
"bg" => Ok(SpecType::Bg),
|
||||||
|
"style" => Ok(SpecType::Style),
|
||||||
|
"none" => Ok(SpecType::None),
|
||||||
|
_ => Err(ColorError::UnrecognizedSpecType(s.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FromStr for Style {
|
||||||
|
type Err = ColorError;
|
||||||
|
|
||||||
|
fn from_str(s: &str) -> Result<Style, ColorError> {
|
||||||
|
match &*s.to_lowercase() {
|
||||||
|
"bold" => Ok(Style::Bold),
|
||||||
|
"nobold" => Ok(Style::NoBold),
|
||||||
|
"intense" => Ok(Style::Intense),
|
||||||
|
"nointense" => Ok(Style::NoIntense),
|
||||||
|
"underline" => Ok(Style::Underline),
|
||||||
|
"nounderline" => Ok(Style::NoUnderline),
|
||||||
|
_ => Err(ColorError::UnrecognizedStyle(s.to_string())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
90
grep-printer/src/counter.rs
Normal file
90
grep-printer/src/counter.rs
Normal file
@@ -0,0 +1,90 @@
|
|||||||
|
use std::io::{self, Write};
|
||||||
|
|
||||||
|
use termcolor::{ColorSpec, WriteColor};
|
||||||
|
|
||||||
|
/// A writer that counts the number of bytes that have been successfully
/// written.
#[derive(Clone, Debug)]
pub struct CounterWriter<W> {
    // The underlying writer.
    wtr: W,
    // Bytes written since construction or the last `reset_count`.
    count: u64,
    // Bytes written before the last `reset_count`; the grand total is
    // `total_count + count` (see `total_count()`).
    total_count: u64,
}
|
||||||
|
|
||||||
|
impl<W: Write> CounterWriter<W> {
|
||||||
|
pub fn new(wtr: W) -> CounterWriter<W> {
|
||||||
|
CounterWriter { wtr: wtr, count: 0, total_count: 0 }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<W> CounterWriter<W> {
    /// Returns the total number of bytes written since construction or the
    /// last time `reset` was called.
    pub fn count(&self) -> u64 {
        self.count
    }

    /// Returns the total number of bytes written since construction.
    ///
    /// `total_count` only accumulates on `reset_count`, so the live
    /// grand total must include the current `count` as well.
    pub fn total_count(&self) -> u64 {
        self.total_count + self.count
    }

    /// Resets the number of bytes written to `0`.
    ///
    /// The current count is folded into the running total first, so
    /// `total_count()` is unaffected by a reset.
    pub fn reset_count(&mut self) {
        self.total_count += self.count;
        self.count = 0;
    }

    /// Clear resets all counting related state for this writer.
    ///
    /// After this call, the total count of bytes written to the underlying
    /// writer is erased and reset.
    #[allow(dead_code)]
    pub fn clear(&mut self) {
        self.count = 0;
        self.total_count = 0;
    }

    /// Returns a shared reference to the underlying writer.
    #[allow(dead_code)]
    pub fn get_ref(&self) -> &W {
        &self.wtr
    }

    /// Returns a mutable reference to the underlying writer. Writes made
    /// through this reference bypass the byte counters.
    pub fn get_mut(&mut self) -> &mut W {
        &mut self.wtr
    }

    /// Unwrap this counter, returning the underlying writer.
    pub fn into_inner(self) -> W {
        self.wtr
    }
}
|
||||||
|
|
||||||
|
impl<W: Write> Write for CounterWriter<W> {
    fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
        // Only the bytes the underlying writer actually accepted are
        // counted, which may be fewer than `buf.len()`.
        let n = self.wtr.write(buf)?;
        self.count += n as u64;
        Ok(n)
    }

    fn flush(&mut self) -> Result<(), io::Error> {
        self.wtr.flush()
    }
}
|
||||||
|
|
||||||
|
// Color operations are pure pass-throughs to the wrapped writer; they do
// not affect the byte counters.
impl<W: WriteColor> WriteColor for CounterWriter<W> {
    fn supports_color(&self) -> bool {
        self.wtr.supports_color()
    }

    fn set_color(&mut self, spec: &ColorSpec) -> io::Result<()> {
        self.wtr.set_color(spec)
    }

    fn reset(&mut self) -> io::Result<()> {
        self.wtr.reset()
    }

    fn is_synchronous(&self) -> bool {
        self.wtr.is_synchronous()
    }
}
|
||||||
963
grep-printer/src/json.rs
Normal file
963
grep-printer/src/json.rs
Normal file
@@ -0,0 +1,963 @@
|
|||||||
|
use std::io::{self, Write};
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
use grep_matcher::{Match, Matcher};
|
||||||
|
use grep_searcher::{
|
||||||
|
Searcher,
|
||||||
|
Sink, SinkError, SinkContext, SinkContextKind, SinkFinish, SinkMatch,
|
||||||
|
};
|
||||||
|
use serde_json as json;
|
||||||
|
|
||||||
|
use counter::CounterWriter;
|
||||||
|
use jsont;
|
||||||
|
use stats::Stats;
|
||||||
|
|
||||||
|
/// The configuration for the JSON printer.
|
||||||
|
///
|
||||||
|
/// This is manipulated by the JSONBuilder and then referenced by the actual
|
||||||
|
/// implementation. Once a printer is built, the configuration is frozen and
|
||||||
|
/// cannot be changed.
|
||||||
|
#[derive(Debug, Clone)]
struct Config {
    // Emit pretty-printed JSON instead of one object per line.
    pretty: bool,
    // Stop printing after this many matches, if set.
    max_matches: Option<u64>,
    // Emit `begin`/`end` messages even for searches with no matches.
    always_begin_end: bool,
}
|
||||||
|
|
||||||
|
impl Default for Config {
|
||||||
|
fn default() -> Config {
|
||||||
|
Config {
|
||||||
|
pretty: false,
|
||||||
|
max_matches: None,
|
||||||
|
always_begin_end: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A builder for a JSON lines printer.
|
||||||
|
///
|
||||||
|
/// The builder permits configuring how the printer behaves. The JSON printer
|
||||||
|
/// has fewer configuration options than the standard printer because it is
|
||||||
|
/// a structured format, and the printer always attempts to find the most
|
||||||
|
/// information possible.
|
||||||
|
///
|
||||||
|
/// Some configuration options, such as whether line numbers are included or
|
||||||
|
/// whether contextual lines are shown, are drawn directly from the
|
||||||
|
/// `grep_searcher::Searcher`'s configuration.
|
||||||
|
///
|
||||||
|
/// Once a `JSON` printer is built, its configuration cannot be changed.
|
||||||
|
#[derive(Clone, Debug)]
pub struct JSONBuilder {
    // Accumulated configuration, frozen into the printer by `build`.
    config: Config,
}
|
||||||
|
|
||||||
|
impl JSONBuilder {
    /// Return a new builder for configuring the JSON printer.
    pub fn new() -> JSONBuilder {
        JSONBuilder { config: Config::default() }
    }

    /// Create a JSON printer that writes results to the given writer.
    pub fn build<W: io::Write>(&self, wtr: W) -> JSON<W> {
        JSON {
            config: self.config.clone(),
            wtr: CounterWriter::new(wtr),
            // NOTE(review): scratch buffer for match ranges, reused
            // across lines — confirm against the `Sink` implementation.
            matches: vec![],
        }
    }

    /// Print JSON in a pretty printed format.
    ///
    /// Enabling this will no longer produce a "JSON lines" format, in that
    /// each JSON object printed may span multiple lines.
    ///
    /// This is disabled by default.
    pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder {
        self.config.pretty = yes;
        self
    }

    /// Set the maximum amount of matches that are printed.
    ///
    /// If multi line search is enabled and a match spans multiple lines, then
    /// that match is counted exactly once for the purposes of enforcing this
    /// limit, regardless of how many lines it spans.
    pub fn max_matches(&mut self, limit: Option<u64>) -> &mut JSONBuilder {
        self.config.max_matches = limit;
        self
    }

    /// When enabled, the `begin` and `end` messages are always emitted, even
    /// when no match is found.
    ///
    /// When disabled, the `begin` and `end` messages are only shown if there
    /// is at least one `match` or `context` message.
    ///
    /// This is disabled by default.
    pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder {
        self.config.always_begin_end = yes;
        self
    }
}
|
||||||
|
|
||||||
|
/// The JSON printer, which emits results in a JSON lines format.
|
||||||
|
///
|
||||||
|
/// This type is generic over `W`, which represents any implementation of
|
||||||
|
/// the standard library `io::Write` trait.
|
||||||
|
///
|
||||||
|
/// # Format
|
||||||
|
///
|
||||||
|
/// This section describes the JSON format used by this printer.
|
||||||
|
///
|
||||||
|
/// To skip the rigamarole, take a look at the
|
||||||
|
/// [example](#example)
|
||||||
|
/// at the end.
|
||||||
|
///
|
||||||
|
/// ## Overview
|
||||||
|
///
|
||||||
|
/// The format of this printer is the [JSON Lines](http://jsonlines.org/)
|
||||||
|
/// format. Specifically, this printer emits a sequence of messages, where
|
||||||
|
/// each message is encoded as a single JSON value on a single line. There are
|
||||||
|
/// four different types of messages (and this number may expand over time):
|
||||||
|
///
|
||||||
|
/// * **begin** - A message that indicates a file is being searched.
|
||||||
|
/// * **end** - A message that indicates a file is done being searched. This
|
||||||
|
/// message also includes summary statistics about the search.
|
||||||
|
/// * **match** - A message that indicates a match was found. This includes
|
||||||
|
/// the text and offsets of the match.
|
||||||
|
/// * **context** - A message that indicates a contextual line was found.
|
||||||
|
/// This includes the text of the line, along with any match information if
|
||||||
|
/// the search was inverted.
|
||||||
|
///
|
||||||
|
/// Every message is encoded in the same envelope format, which includes a tag
|
||||||
|
/// indicating the message type along with an object for the payload:
|
||||||
|
///
|
||||||
|
/// ```json
|
||||||
|
/// {
|
||||||
|
/// "type": "{begin|end|match|context}",
|
||||||
|
/// "data": { ... }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// The message itself is encoded in the envelope's `data` key.
|
||||||
|
///
|
||||||
|
/// ## Text encoding
|
||||||
|
///
|
||||||
|
/// Before describing each message format, we first must briefly discuss text
|
||||||
|
/// encoding, since it factors into every type of message. In particular, JSON
|
||||||
|
/// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this
|
||||||
|
/// printer, we need only worry about UTF-8. The problem here is that searching
|
||||||
|
/// is not limited to UTF-8 exclusively, which in turn implies that matches
|
||||||
|
/// may be reported that contain invalid UTF-8. Moreover, this printer may
|
||||||
|
/// also print file paths, and the encoding of file paths is itself not
|
||||||
|
/// guaranteed to be valid UTF-8. Therefore, this printer must deal with the
|
||||||
|
/// presence of invalid UTF-8 somehow. The printer could silently ignore such
|
||||||
|
/// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8
|
||||||
|
/// by replacing all invalid sequences with the Unicode replacement character.
|
||||||
|
/// However, this would prevent consumers of this format from accessing the
|
||||||
|
/// original data in a non-lossy way.
|
||||||
|
///
|
||||||
|
/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal
|
||||||
|
/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To
|
||||||
|
/// communicate whether this process occurs or not, strings are keyed by the
|
||||||
|
/// name `text` where as arbitrary bytes are keyed by `bytes`.
|
||||||
|
///
|
||||||
|
/// For example, when a path is included in a message, it is formatted like so,
|
||||||
|
/// if and only if the path is valid UTF-8:
|
||||||
|
///
|
||||||
|
/// ```json
|
||||||
|
/// {
|
||||||
|
/// "path": {
|
||||||
|
/// "text": "/home/ubuntu/lib.rs"
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte
|
||||||
|
/// makes it invalid UTF-8, the path would instead be encoded like so:
|
||||||
|
///
|
||||||
|
/// ```json
|
||||||
|
/// {
|
||||||
|
/// "path": {
|
||||||
|
/// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM="
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// This same representation is used for reporting matches as well.
|
||||||
|
///
|
||||||
|
/// The printer guarantees that the `text` field is used whenever the
|
||||||
|
/// underlying bytes are valid UTF-8.
|
||||||
|
///
|
||||||
|
/// ## Wire format
|
||||||
|
///
|
||||||
|
/// This section documents the wire format emitted by this printer, starting
|
||||||
|
/// with the four types of messages.
|
||||||
|
///
|
||||||
|
/// Each message has its own format, and is contained inside an envelope that
|
||||||
|
/// indicates the type of message. The envelope has these fields:
|
||||||
|
///
|
||||||
|
/// * **type** - A string indicating the type of this message. It may be one
|
||||||
|
/// of four possible strings: `begin`, `end`, `match` or `context`. This
|
||||||
|
/// list may expand over time.
|
||||||
|
/// * **data** - The actual message data. The format of this field depends on
|
||||||
|
/// the value of `type`. The possible message formats are
|
||||||
|
/// [`begin`](#message-begin),
|
||||||
|
/// [`end`](#message-end),
|
||||||
|
/// [`match`](#message-match),
|
||||||
|
/// [`context`](#message-context).
|
||||||
|
///
|
||||||
|
/// #### Message: **begin**
|
||||||
|
///
|
||||||
|
/// This message indicates that a search has begun. It has these fields:
|
||||||
|
///
|
||||||
|
/// * **path** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing the file path corresponding to the search, if one is
|
||||||
|
/// present. If no file path is available, then this field is `null`.
|
||||||
|
///
|
||||||
|
/// #### Message: **end**
|
||||||
|
///
|
||||||
|
/// This message indicates that a search has finished. It has these fields:
|
||||||
|
///
|
||||||
|
/// * **path** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing the file path corresponding to the search, if one is
|
||||||
|
/// present. If no file path is available, then this field is `null`.
|
||||||
|
/// * **binary_offset** - The absolute offset in the data searched
|
||||||
|
/// corresponding to the place at which binary data was detected. If no
|
||||||
|
/// binary data was detected (or if binary detection was disabled), then this
|
||||||
|
/// field is `null`.
|
||||||
|
/// * **stats** - A [`stats` object](#object-stats) that contains summary
|
||||||
|
/// statistics for the previous search.
|
||||||
|
///
|
||||||
|
/// #### Message: **match**
|
||||||
|
///
|
||||||
|
/// This message indicates that a match has been found. A match generally
|
||||||
|
/// corresponds to a single line of text, although it may correspond to
|
||||||
|
/// multiple lines if the search can emit matches over multiple lines. It
|
||||||
|
/// has these fields:
|
||||||
|
///
|
||||||
|
/// * **path** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing the file path corresponding to the search, if one is
|
||||||
|
/// present. If no file path is available, then this field is `null`.
|
||||||
|
/// * **lines** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing one or more lines contained in this match.
|
||||||
|
/// * **line_number** - If the searcher has been configured to report line
|
||||||
|
/// numbers, then this corresponds to the line number of the first line
|
||||||
|
/// in `lines`. If no line numbers are available, then this is `null`.
|
||||||
|
/// * **absolute_offset** - The absolute byte offset corresponding to the start
|
||||||
|
/// of `lines` in the data being searched.
|
||||||
|
/// * **submatches** - An array of [`submatch` objects](#object-submatch)
|
||||||
|
/// corresponding to matches in `lines`. The offsets included in each
|
||||||
|
/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
|
||||||
|
/// encoded, then the byte offsets correspond to the data after base64
|
||||||
|
/// decoding.) The `submatch` objects are guaranteed to be sorted by their
|
||||||
|
/// starting offsets. Note that it is possible for this array to be empty,
|
||||||
|
/// for example, when searching reports inverted matches.
|
||||||
|
///
|
||||||
|
/// #### Message: **context**
|
||||||
|
///
|
||||||
|
/// This message indicates that a contextual line has been found. A contextual
|
||||||
|
/// line is a line that doesn't contain a match, but is generally adjacent to
|
||||||
|
/// a line that does contain a match. The precise way in which contextual lines
|
||||||
|
/// are reported is determined by the searcher. It has these fields, which are
|
||||||
|
/// exactly the same fields found in a [`match`](#message-match):
|
||||||
|
///
|
||||||
|
/// * **path** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing the file path corresponding to the search, if one is
|
||||||
|
/// present. If no file path is available, then this field is `null`.
|
||||||
|
/// * **lines** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// representing one or more lines contained in this context. This includes
|
||||||
|
/// line terminators, if they're present.
|
||||||
|
/// * **line_number** - If the searcher has been configured to report line
|
||||||
|
/// numbers, then this corresponds to the line number of the first line
|
||||||
|
/// in `lines`. If no line numbers are available, then this is `null`.
|
||||||
|
/// * **absolute_offset** - The absolute byte offset corresponding to the start
|
||||||
|
/// of `lines` in the data being searched.
|
||||||
|
/// * **submatches** - An array of [`submatch` objects](#object-submatch)
|
||||||
|
/// corresponding to matches in `lines`. The offsets included in each
|
||||||
|
/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64
|
||||||
|
/// encoded, then the byte offsets correspond to the data after base64
|
||||||
|
/// decoding.) The `submatch` objects are guaranteed to be sorted by
|
||||||
|
/// their starting offsets. Note that it is possible for this array to be
|
||||||
|
/// non-empty, for example, when searching reports inverted matches such that
|
||||||
|
/// the original matcher could match things in the contextual lines.
|
||||||
|
///
|
||||||
|
/// #### Object: **submatch**
|
||||||
|
///
|
||||||
|
/// This object describes submatches found within `match` or `context`
|
||||||
|
/// messages. The `start` and `end` fields indicate the half-open interval on
|
||||||
|
/// which the match occurs (`start` is included, but `end` is not). It is
|
||||||
|
/// guaranteed that `start <= end`. It has these fields:
|
||||||
|
///
|
||||||
|
/// * **match** - An
|
||||||
|
/// [arbitrary data object](#object-arbitrary-data)
|
||||||
|
/// corresponding to the text in this submatch.
|
||||||
|
/// * **start** - A byte offset indicating the start of this match. This offset
|
||||||
|
/// is generally reported in terms of the parent object's data. For example,
|
||||||
|
/// the `lines` field in the
|
||||||
|
/// [`match`](#message-match) or [`context`](#message-context)
|
||||||
|
/// messages.
|
||||||
|
/// * **end** - A byte offset indicating the end of this match. This offset
|
||||||
|
/// is generally reported in terms of the parent object's data. For example,
|
||||||
|
/// the `lines` field in the
|
||||||
|
/// [`match`](#message-match) or [`context`](#message-context)
|
||||||
|
/// messages.
|
||||||
|
///
|
||||||
|
/// #### Object: **stats**
|
||||||
|
///
|
||||||
|
/// This object is included in messages and contains summary statistics about
|
||||||
|
/// a search. It has these fields:
|
||||||
|
///
|
||||||
|
/// * **elapsed** - A [`duration` object](#object-duration) describing the
|
||||||
|
/// length of time that elapsed while performing the search.
|
||||||
|
/// * **searches** - The number of searches that have run. For this printer,
|
||||||
|
/// this value is always `1`. (Implementations may emit additional message
|
||||||
|
/// types that use this same `stats` object that represents summary
|
||||||
|
/// statistics over multiple searches.)
|
||||||
|
/// * **searches_with_match** - The number of searches that have run that have
|
||||||
|
/// found at least one match. This is never more than `searches`.
|
||||||
|
/// * **bytes_searched** - The total number of bytes that have been searched.
|
||||||
|
/// * **bytes_printed** - The total number of bytes that have been printed.
|
||||||
|
/// This includes everything emitted by this printer.
|
||||||
|
/// * **matched_lines** - The total number of lines that participated in a
|
||||||
|
/// match. When matches may contain multiple lines, then this includes every
|
||||||
|
/// line that is part of every match.
|
||||||
|
/// * **matches** - The total number of matches. There may be multiple matches
|
||||||
|
/// per line. When matches may contain multiple lines, each match is counted
|
||||||
|
/// only once, regardless of how many lines it spans.
|
||||||
|
///
|
||||||
|
/// #### Object: **duration**
|
||||||
|
///
|
||||||
|
/// This object includes a few fields for describing a duration. Two of its
|
||||||
|
/// fields, `secs` and `nanos`, can be combined to give nanosecond precision
|
||||||
|
/// on systems that support it. It has these fields:
|
||||||
|
///
|
||||||
|
/// * **secs** - A whole number of seconds indicating the length of this
|
||||||
|
/// duration.
|
||||||
|
/// * **nanos** - A fractional part of this duration represent by nanoseconds.
|
||||||
|
/// If nanosecond precision isn't supported, then this is typically rounded
|
||||||
|
/// up to the nearest number of nanoseconds.
|
||||||
|
/// * **human** - A human readable string describing the length of the
|
||||||
|
/// duration. The format of the string is itself unspecified.
|
||||||
|
///
|
||||||
|
/// #### Object: **arbitrary data**
|
||||||
|
///
|
||||||
|
/// This object is used whenever arbitrary data needs to be represented as a
|
||||||
|
/// JSON value. This object contains two fields, where generally only one of
|
||||||
|
/// the fields is present:
|
||||||
|
///
|
||||||
|
/// * **text** - A normal JSON string that is UTF-8 encoded. This field is
|
||||||
|
/// populated if and only if the underlying data is valid UTF-8.
|
||||||
|
/// * **bytes** - A normal JSON string that is a base64 encoding of the
|
||||||
|
/// underlying bytes.
|
||||||
|
///
|
||||||
|
/// More information on the motivation for this representation can be seen in
|
||||||
|
/// the section [text encoding](#text-encoding) above.
|
||||||
|
///
|
||||||
|
/// ## Example
|
||||||
|
///
|
||||||
|
/// This section shows a small example that includes all message types.
|
||||||
|
///
|
||||||
|
/// Here's the file we want to search, located at `/home/andrew/sherlock`:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
/// Holmeses, success in the province of detective work must always
|
||||||
|
/// be, to a very large extent, the result of luck. Sherlock Holmes
|
||||||
|
/// can extract a clew from a wisp of straw or a flake of cigar ash;
|
||||||
|
/// but Doctor Watson has to have it taken out for him and dusted,
|
||||||
|
/// and exhibited clearly, with a label attached.
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Searching for `Watson` with a `before_context` of `1` with line numbers
|
||||||
|
/// enabled shows something like this using the standard printer:
|
||||||
|
///
|
||||||
|
/// ```text
|
||||||
|
/// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
/// --
|
||||||
|
/// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash;
|
||||||
|
/// sherlock:5:but Doctor Watson has to have it taken out for him and dusted,
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// Here's what the same search looks like using the JSON wire format described
|
||||||
|
/// above, wherein we show semi-prettified JSON (instead of a strict JSON
|
||||||
|
/// Lines format), for illustrative purposes:
|
||||||
|
///
|
||||||
|
/// ```json
|
||||||
|
/// {
|
||||||
|
/// "type": "begin",
|
||||||
|
/// "data": {
|
||||||
|
///       "path": {"text": "/home/andrew/sherlock"}
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// {
|
||||||
|
/// "type": "match",
|
||||||
|
/// "data": {
|
||||||
|
/// "path": {"text": "/home/andrew/sherlock"},
|
||||||
|
/// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"},
|
||||||
|
/// "line_number": 1,
|
||||||
|
/// "absolute_offset": 0,
|
||||||
|
/// "submatches": [
|
||||||
|
/// {"match": {"text": "Watson"}, "start": 15, "end": 21}
|
||||||
|
/// ]
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// {
|
||||||
|
/// "type": "context",
|
||||||
|
/// "data": {
|
||||||
|
/// "path": {"text": "/home/andrew/sherlock"},
|
||||||
|
/// "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"},
|
||||||
|
/// "line_number": 4,
|
||||||
|
/// "absolute_offset": 193,
|
||||||
|
/// "submatches": []
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// {
|
||||||
|
/// "type": "match",
|
||||||
|
/// "data": {
|
||||||
|
/// "path": {"text": "/home/andrew/sherlock"},
|
||||||
|
/// "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"},
|
||||||
|
/// "line_number": 5,
|
||||||
|
/// "absolute_offset": 258,
|
||||||
|
/// "submatches": [
|
||||||
|
/// {"match": {"text": "Watson"}, "start": 11, "end": 17}
|
||||||
|
/// ]
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// {
|
||||||
|
/// "type": "end",
|
||||||
|
/// "data": {
|
||||||
|
/// "path": {"text": "/home/andrew/sherlock"},
|
||||||
|
/// "binary_offset": null,
|
||||||
|
/// "stats": {
|
||||||
|
/// "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"},
|
||||||
|
/// "searches": 1,
|
||||||
|
/// "searches_with_match": 1,
|
||||||
|
/// "bytes_searched": 367,
|
||||||
|
/// "bytes_printed": 1151,
|
||||||
|
/// "matched_lines": 2,
|
||||||
|
/// "matches": 2
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
#[derive(Debug)]
pub struct JSON<W> {
    // The printer's configuration (pretty printing, match limits, etc.),
    // as produced by `JSONBuilder`.
    config: Config,
    // The underlying writer, wrapped so that the number of bytes written
    // can be tracked (used for the `bytes_printed` statistic and
    // `has_written`).
    wtr: CounterWriter<W>,
    // A reusable buffer of match offsets recorded for the current line(s).
    // Kept on the printer so that its allocation is amortized across
    // searches.
    matches: Vec<Match>,
}
|
||||||
|
|
||||||
|
impl<W: io::Write> JSON<W> {
|
||||||
|
/// Return a JSON lines printer with a default configuration that writes
|
||||||
|
/// matches to the given writer.
|
||||||
|
pub fn new(wtr: W) -> JSON<W> {
|
||||||
|
JSONBuilder::new().build(wtr)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return an implementation of `Sink` for the JSON printer.
|
||||||
|
///
|
||||||
|
/// This does not associate the printer with a file path, which means this
|
||||||
|
/// implementation will never print a file path along with the matches.
|
||||||
|
pub fn sink<'s, M: Matcher>(
|
||||||
|
&'s mut self,
|
||||||
|
matcher: M,
|
||||||
|
) -> JSONSink<'static, 's, M, W> {
|
||||||
|
JSONSink {
|
||||||
|
matcher: matcher,
|
||||||
|
json: self,
|
||||||
|
path: None,
|
||||||
|
start_time: Instant::now(),
|
||||||
|
match_count: 0,
|
||||||
|
after_context_remaining: 0,
|
||||||
|
binary_byte_offset: None,
|
||||||
|
begin_printed: false,
|
||||||
|
stats: Stats::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return an implementation of `Sink` associated with a file path.
|
||||||
|
///
|
||||||
|
/// When the printer is associated with a path, then it may, depending on
|
||||||
|
/// its configuration, print the path along with the matches found.
|
||||||
|
pub fn sink_with_path<'p, 's, M, P>(
|
||||||
|
&'s mut self,
|
||||||
|
matcher: M,
|
||||||
|
path: &'p P,
|
||||||
|
) -> JSONSink<'p, 's, M, W>
|
||||||
|
where M: Matcher,
|
||||||
|
P: ?Sized + AsRef<Path>,
|
||||||
|
{
|
||||||
|
JSONSink {
|
||||||
|
matcher: matcher,
|
||||||
|
json: self,
|
||||||
|
path: Some(path.as_ref()),
|
||||||
|
start_time: Instant::now(),
|
||||||
|
match_count: 0,
|
||||||
|
after_context_remaining: 0,
|
||||||
|
binary_byte_offset: None,
|
||||||
|
begin_printed: false,
|
||||||
|
stats: Stats::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the given message followed by a new line. The new line is
|
||||||
|
/// determined from the configuration of the given searcher.
|
||||||
|
fn write_message(&mut self, message: &jsont::Message) -> io::Result<()> {
|
||||||
|
if self.config.pretty {
|
||||||
|
json::to_writer_pretty(&mut self.wtr, message)?;
|
||||||
|
} else {
|
||||||
|
json::to_writer(&mut self.wtr, message)?;
|
||||||
|
}
|
||||||
|
self.wtr.write(&[b'\n'])?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<W> JSON<W> {
|
||||||
|
/// Returns true if and only if this printer has written at least one byte
|
||||||
|
/// to the underlying writer during any of the previous searches.
|
||||||
|
pub fn has_written(&self) -> bool {
|
||||||
|
self.wtr.total_count() > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a mutable reference to the underlying writer.
|
||||||
|
pub fn get_mut(&mut self) -> &mut W {
|
||||||
|
self.wtr.get_mut()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume this printer and return back ownership of the underlying
|
||||||
|
/// writer.
|
||||||
|
pub fn into_inner(self) -> W {
|
||||||
|
self.wtr.into_inner()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An implementation of `Sink` associated with a matcher and an optional file
/// path for the JSON printer.
///
/// This type is generic over a few type parameters:
///
/// * `'p` refers to the lifetime of the file path, if one is provided. When
/// no file path is given, then this is `'static`.
/// * `'s` refers to the lifetime of the
/// [`JSON`](struct.JSON.html)
/// printer that this type borrows.
/// * `M` refers to the type of matcher used by
/// `grep_searcher::Searcher` that is reporting results to this sink.
/// * `W` refers to the underlying writer that this printer is writing its
/// output to.
#[derive(Debug)]
pub struct JSONSink<'p, 's, M: Matcher, W: 's> {
    // The matcher used to find submatch offsets within matched/context lines.
    matcher: M,
    // The borrowed JSON printer, which owns the configuration and writer.
    json: &'s mut JSON<W>,
    // The file path associated with this search, if any; serialized into
    // every message's `path` field.
    path: Option<&'p Path>,
    // When the current search began; its elapsed time feeds the `stats`
    // reported in the "end" message.
    start_time: Instant,
    // Number of matches reported during the current search. Reset in
    // `begin`.
    match_count: u64,
    // Number of "after" context lines still owed following the most recent
    // match. Quitting on `max_matches` is deferred until this reaches zero.
    after_context_remaining: u64,
    // Absolute offset at which binary data was first detected in the
    // previous search, if any.
    binary_byte_offset: Option<u64>,
    // Whether the "begin" message has been emitted for the current search.
    // Guards against emitting it more than once (or emitting "end" without
    // a matching "begin").
    begin_printed: bool,
    // Summary statistics accumulated across all searches on this sink.
    stats: Stats,
}
|
||||||
|
|
||||||
|
impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> {
|
||||||
|
/// Returns true if and only if this printer received a match in the
|
||||||
|
/// previous search.
|
||||||
|
///
|
||||||
|
/// This is unaffected by the result of searches before the previous
|
||||||
|
/// search.
|
||||||
|
pub fn has_match(&self) -> bool {
|
||||||
|
self.match_count > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the total number of matches reported to this sink.
|
||||||
|
///
|
||||||
|
/// This corresponds to the number of times `Sink::matched` is called.
|
||||||
|
pub fn match_count(&self) -> u64 {
|
||||||
|
self.match_count
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If binary data was found in the previous search, this returns the
|
||||||
|
/// offset at which the binary data was first detected.
|
||||||
|
///
|
||||||
|
/// The offset returned is an absolute offset relative to the entire
|
||||||
|
/// set of bytes searched.
|
||||||
|
///
|
||||||
|
/// This is unaffected by the result of searches before the previous
|
||||||
|
/// search. e.g., If the search prior to the previous search found binary
|
||||||
|
/// data but the previous search found no binary data, then this will
|
||||||
|
/// return `None`.
|
||||||
|
pub fn binary_byte_offset(&self) -> Option<u64> {
|
||||||
|
self.binary_byte_offset
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a reference to the stats produced by the printer for all
|
||||||
|
/// searches executed on this sink.
|
||||||
|
pub fn stats(&self) -> &Stats {
|
||||||
|
&self.stats
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute the matcher over the given bytes and record the match
|
||||||
|
/// locations if the current configuration demands match granularity.
|
||||||
|
fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> {
|
||||||
|
self.json.matches.clear();
|
||||||
|
// If printing requires knowing the location of each individual match,
|
||||||
|
// then compute and stored those right now for use later. While this
|
||||||
|
// adds an extra copy for storing the matches, we do amortize the
|
||||||
|
// allocation for it and this greatly simplifies the printing logic to
|
||||||
|
// the extent that it's easy to ensure that we never do more than
|
||||||
|
// one search to find the matches.
|
||||||
|
let matches = &mut self.json.matches;
|
||||||
|
self.matcher.find_iter(bytes, |m| {
|
||||||
|
matches.push(m);
|
||||||
|
true
|
||||||
|
}).map_err(io::Error::error_message)?;
|
||||||
|
// Don't report empty matches appearing at the end of the bytes.
|
||||||
|
if !matches.is_empty()
|
||||||
|
&& matches.last().unwrap().is_empty()
|
||||||
|
&& matches.last().unwrap().start() >= bytes.len()
|
||||||
|
{
|
||||||
|
matches.pop().unwrap();
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if this printer should quit.
|
||||||
|
///
|
||||||
|
/// This implements the logic for handling quitting after seeing a certain
|
||||||
|
/// amount of matches. In most cases, the logic is simple, but we must
|
||||||
|
/// permit all "after" contextual lines to print after reaching the limit.
|
||||||
|
fn should_quit(&self) -> bool {
|
||||||
|
let limit = match self.json.config.max_matches {
|
||||||
|
None => return false,
|
||||||
|
Some(limit) => limit,
|
||||||
|
};
|
||||||
|
if self.match_count < limit {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
self.after_context_remaining == 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write the "begin" message.
|
||||||
|
fn write_begin_message(&mut self) -> io::Result<()> {
|
||||||
|
if self.begin_printed {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
let msg = jsont::Message::Begin(jsont::Begin {
|
||||||
|
path: self.path,
|
||||||
|
});
|
||||||
|
self.json.write_message(&msg)?;
|
||||||
|
self.begin_printed = true;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> {
    type Error = io::Error;

    // Called by the searcher for every matching line (or multi-line match).
    // Emits a "match" message and updates per-search statistics.
    fn matched(
        &mut self,
        searcher: &Searcher,
        mat: &SinkMatch,
    ) -> Result<bool, io::Error> {
        // Lazily emit "begin" so it only appears when there is something
        // to report (unless `always_begin_end` forced it in `begin`).
        self.write_begin_message()?;

        self.match_count += 1;
        // Each match resets the budget of "after" context lines that must
        // still be printed before `max_matches` may stop the search.
        self.after_context_remaining = searcher.after_context() as u64;
        self.record_matches(mat.bytes())?;
        self.stats.add_matches(self.json.matches.len() as u64);
        self.stats.add_matched_lines(mat.lines().count() as u64);

        let submatches = SubMatches::new(mat.bytes(), &self.json.matches);
        let msg = jsont::Message::Match(jsont::Match {
            path: self.path,
            lines: mat.bytes(),
            line_number: mat.line_number(),
            absolute_offset: mat.absolute_byte_offset(),
            submatches: submatches.as_slice(),
        });
        self.json.write_message(&msg)?;
        // Returning false tells the searcher to stop.
        Ok(!self.should_quit())
    }

    // Called by the searcher for every contextual (non-matching) line.
    // Emits a "context" message; submatches are only computed for inverted
    // searches, where context lines can contain matcher hits.
    fn context(
        &mut self,
        searcher: &Searcher,
        ctx: &SinkContext,
    ) -> Result<bool, io::Error> {
        self.write_begin_message()?;
        self.json.matches.clear();

        if ctx.kind() == &SinkContextKind::After {
            // One fewer "after" context line owed before `max_matches`
            // may take effect.
            self.after_context_remaining =
                self.after_context_remaining.saturating_sub(1);
        }
        let submatches =
            if searcher.invert_match() {
                self.record_matches(ctx.bytes())?;
                SubMatches::new(ctx.bytes(), &self.json.matches)
            } else {
                SubMatches::empty()
            };
        let msg = jsont::Message::Context(jsont::Context {
            path: self.path,
            lines: ctx.bytes(),
            line_number: ctx.line_number(),
            absolute_offset: ctx.absolute_byte_offset(),
            submatches: submatches.as_slice(),
        });
        self.json.write_message(&msg)?;
        Ok(!self.should_quit())
    }

    // Called once before each search. Resets all per-search state.
    fn begin(
        &mut self,
        _searcher: &Searcher,
    ) -> Result<bool, io::Error> {
        self.json.wtr.reset_count();
        self.start_time = Instant::now();
        self.match_count = 0;
        self.after_context_remaining = 0;
        self.binary_byte_offset = None;
        // A limit of zero means nothing can ever match; skip the search
        // entirely.
        if self.json.config.max_matches == Some(0) {
            return Ok(false);
        }

        if !self.json.config.always_begin_end {
            return Ok(true);
        }
        self.write_begin_message()?;
        Ok(true)
    }

    // Called once after each search. Emits the "end" message with summary
    // statistics, but only if a matching "begin" was emitted.
    fn finish(
        &mut self,
        _searcher: &Searcher,
        finish: &SinkFinish,
    ) -> Result<(), io::Error> {
        if !self.begin_printed {
            return Ok(());
        }

        self.binary_byte_offset = finish.binary_byte_offset();
        self.stats.add_elapsed(self.start_time.elapsed());
        self.stats.add_searches(1);
        if self.match_count > 0 {
            self.stats.add_searches_with_match(1);
        }
        self.stats.add_bytes_searched(finish.byte_count());
        self.stats.add_bytes_printed(self.json.wtr.count());

        let msg = jsont::Message::End(jsont::End {
            path: self.path,
            binary_offset: finish.binary_byte_offset(),
            stats: self.stats.clone(),
        });
        self.json.write_message(&msg)?;
        Ok(())
    }
}
|
||||||
|
|
||||||
|
/// SubMatches represents a set of matches in a contiguous range of bytes.
///
/// A simpler representation for this would just simply be `Vec<SubMatch>`,
/// but the common case is exactly one match per range of bytes, which we
/// specialize here using a fixed size array without any allocation.
enum SubMatches<'a> {
    // No submatches at all (e.g., non-inverted context lines).
    Empty,
    // Exactly one submatch, stored inline without heap allocation.
    Small([jsont::SubMatch<'a>; 1]),
    // Zero or two-or-more submatches, stored on the heap.
    Big(Vec<jsont::SubMatch<'a>>),
}
|
||||||
|
|
||||||
|
impl<'a> SubMatches<'a> {
|
||||||
|
/// Create a new set of match ranges from a set of matches and the
|
||||||
|
/// corresponding bytes that those matches apply to.
|
||||||
|
fn new(bytes: &'a[u8], matches: &[Match]) -> SubMatches<'a> {
|
||||||
|
if matches.len() == 1 {
|
||||||
|
let mat = matches[0];
|
||||||
|
SubMatches::Small([jsont::SubMatch {
|
||||||
|
m: &bytes[mat],
|
||||||
|
start: mat.start(),
|
||||||
|
end: mat.end(),
|
||||||
|
}])
|
||||||
|
} else {
|
||||||
|
let mut match_ranges = vec![];
|
||||||
|
for &mat in matches {
|
||||||
|
match_ranges.push(jsont::SubMatch {
|
||||||
|
m: &bytes[mat],
|
||||||
|
start: mat.start(),
|
||||||
|
end: mat.end(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
SubMatches::Big(match_ranges)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create an empty set of match ranges.
|
||||||
|
fn empty() -> SubMatches<'static> {
|
||||||
|
SubMatches::Empty
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return this set of match ranges as a slice.
|
||||||
|
fn as_slice(&self) -> &[jsont::SubMatch] {
|
||||||
|
match *self {
|
||||||
|
SubMatches::Empty => &[],
|
||||||
|
SubMatches::Small(ref x) => x,
|
||||||
|
SubMatches::Big(ref x) => x,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use grep_regex::{RegexMatcher, RegexMatcherBuilder};
    use grep_matcher::LineTerminator;
    use grep_searcher::SearcherBuilder;

    use super::{JSON, JSONBuilder};

    // The haystack shared by most tests below. The `\` after `b"` elides
    // the leading newline of the raw text.
    const SHERLOCK: &'static [u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.
";

    // Extract everything the printer has written so far as a UTF-8 string.
    fn printer_contents(
        printer: &mut JSON<Vec<u8>>,
    ) -> String {
        String::from_utf8(printer.get_mut().to_owned()).unwrap()
    }

    #[test]
    fn binary_detection() {
        use grep_searcher::BinaryDetection;

        // Same haystack as SHERLOCK, but with a NUL byte on line 4 to
        // trigger binary detection, and no trailing newline.
        const BINARY: &'static [u8] = b"\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew \x00 from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

        let matcher = RegexMatcher::new(
            r"Watson"
        ).unwrap();
        let mut printer = JSONBuilder::new()
            .build(vec![]);
        SearcherBuilder::new()
            .binary_detection(BinaryDetection::quit(b'\x00'))
            .heap_limit(Some(80))
            .build()
            .search_reader(&matcher, BINARY, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);

        // Expect begin, one match (line 1) and end: the search quits when
        // the NUL byte is seen, before the second `Watson` on line 5.
        assert_eq!(got.lines().count(), 3);
        let last = got.lines().last().unwrap();
        // The "end" message must report where binary data was detected.
        assert!(last.contains(r#""binary_offset":212,"#));
    }

    #[test]
    fn max_matches() {
        let matcher = RegexMatcher::new(
            r"Watson"
        ).unwrap();
        let mut printer = JSONBuilder::new()
            .max_matches(Some(1))
            .build(vec![]);
        SearcherBuilder::new()
            .build()
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);

        // Only begin, a single match and end: the limit stops the search.
        assert_eq!(got.lines().count(), 3);
    }

    #[test]
    fn no_match() {
        let matcher = RegexMatcher::new(
            r"DOES NOT MATCH"
        ).unwrap();
        let mut printer = JSONBuilder::new()
            .build(vec![]);
        SearcherBuilder::new()
            .build()
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);

        // With no match and default config, nothing at all is printed
        // (begin/end are suppressed).
        assert!(got.is_empty());
    }

    #[test]
    fn always_begin_end_no_match() {
        let matcher = RegexMatcher::new(
            r"DOES NOT MATCH"
        ).unwrap();
        let mut printer = JSONBuilder::new()
            .always_begin_end(true)
            .build(vec![]);
        SearcherBuilder::new()
            .build()
            .search_reader(&matcher, SHERLOCK, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);

        // With always_begin_end, begin/end are emitted even without a match.
        assert_eq!(got.lines().count(), 2);
        assert!(got.contains("begin") && got.contains("end"));
    }

    #[test]
    fn missing_crlf() {
        let haystack = "test\r\n".as_bytes();

        // Without CRLF handling, the match line should still include the
        // full `\r\n` terminator in its reported text.
        let matcher = RegexMatcherBuilder::new()
            .build("test")
            .unwrap();
        let mut printer = JSONBuilder::new()
            .build(vec![]);
        SearcherBuilder::new()
            .build()
            .search_reader(&matcher, haystack, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);
        assert_eq!(got.lines().count(), 3);
        assert!(
            got.lines().nth(1).unwrap().contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            got.lines().nth(1).unwrap(),
        );

        // With CRLF-aware matching and a CRLF line terminator, the reported
        // line text should be unchanged (terminator still present).
        let matcher = RegexMatcherBuilder::new()
            .crlf(true)
            .build("test")
            .unwrap();
        let mut printer = JSONBuilder::new()
            .build(vec![]);
        SearcherBuilder::new()
            .line_terminator(LineTerminator::crlf())
            .build()
            .search_reader(&matcher, haystack, printer.sink(&matcher))
            .unwrap();
        let got = printer_contents(&mut printer);
        assert_eq!(got.lines().count(), 3);
        assert!(
            got.lines().nth(1).unwrap().contains(r"test\r\n"),
            r"missing 'test\r\n' in '{}'",
            got.lines().nth(1).unwrap(),
        );
    }
}
|
||||||
147
grep-printer/src/jsont.rs
Normal file
147
grep-printer/src/jsont.rs
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
// This module defines the types we use for JSON serialization. We specifically
|
||||||
|
// omit deserialization, partially because there isn't a clear use case for
|
||||||
|
// them at this time, but also because deserialization will complicate things.
|
||||||
|
// Namely, the types below are designed in a way that permits JSON
|
||||||
|
// serialization with little or no allocation. Allocation is often quite
|
||||||
|
// convenient for deserialization however, so these types would become a bit
|
||||||
|
// more complex.
|
||||||
|
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use base64;
|
||||||
|
use serde::{Serialize, Serializer};
|
||||||
|
|
||||||
|
use stats::Stats;
|
||||||
|
|
||||||
|
/// The set of JSON Lines messages emitted by the JSON printer.
///
/// Serialized adjacently tagged as `{"type": "...", "data": {...}}`, with
/// the variant name rendered in snake_case.
#[derive(Serialize)]
#[serde(tag = "type", content = "data")]
#[serde(rename_all = "snake_case")]
pub enum Message<'a> {
    /// Emitted when a search begins.
    Begin(Begin<'a>),
    /// Emitted when a search ends.
    End(End<'a>),
    /// Emitted for each match found.
    Match(Match<'a>),
    /// Emitted for each contextual line reported.
    Context(Context<'a>),
}

/// The message sent at the beginning of a single search.
#[derive(Serialize)]
pub struct Begin<'a> {
    /// The path searched, if one is available.
    #[serde(serialize_with = "ser_path")]
    pub path: Option<&'a Path>,
}

/// The message sent at the end of a single search, carrying its statistics.
#[derive(Serialize)]
pub struct End<'a> {
    /// The path searched, if one is available.
    #[serde(serialize_with = "ser_path")]
    pub path: Option<&'a Path>,
    /// The offset at which binary data was detected, if any.
    pub binary_offset: Option<u64>,
    /// Statistics for this search.
    pub stats: Stats,
}

/// The message sent for each match.
#[derive(Serialize)]
pub struct Match<'a> {
    /// The path searched, if one is available.
    #[serde(serialize_with = "ser_path")]
    pub path: Option<&'a Path>,
    /// The raw bytes of the line(s) containing the match.
    #[serde(serialize_with = "ser_bytes")]
    pub lines: &'a [u8],
    /// The line number of the first line in `lines`, if line numbers are
    /// enabled.
    pub line_number: Option<u64>,
    /// The absolute byte offset of the start of `lines` in the haystack.
    pub absolute_offset: u64,
    /// The individual match regions within `lines`.
    pub submatches: &'a [SubMatch<'a>],
}

/// The message sent for each contextual line.
#[derive(Serialize)]
pub struct Context<'a> {
    /// The path searched, if one is available.
    #[serde(serialize_with = "ser_path")]
    pub path: Option<&'a Path>,
    /// The raw bytes of the contextual line(s).
    #[serde(serialize_with = "ser_bytes")]
    pub lines: &'a [u8],
    /// The line number of the first line in `lines`, if line numbers are
    /// enabled.
    pub line_number: Option<u64>,
    /// The absolute byte offset of the start of `lines` in the haystack.
    pub absolute_offset: u64,
    /// Any match regions within this contextual line (e.g., from inverted
    /// matching).
    pub submatches: &'a [SubMatch<'a>],
}

/// A single match region within a line, with offsets relative to that line.
#[derive(Serialize)]
pub struct SubMatch<'a> {
    /// The matched bytes.
    #[serde(rename = "match")]
    #[serde(serialize_with = "ser_bytes")]
    pub m: &'a [u8],
    /// The starting byte offset of this match within its line.
    pub start: usize,
    /// The ending byte offset (exclusive) of this match within its line.
    pub end: usize,
}

/// Data represents things that look like strings, but may actually not be
/// valid UTF-8. To handle this, `Data` is serialized as an object with one
/// of two keys: `text` (for valid UTF-8) or `bytes` (for invalid UTF-8).
///
/// The happy path is valid UTF-8, which streams right through as-is, since
/// it is natively supported by JSON. When invalid UTF-8 is found, then it is
/// represented as arbitrary bytes and base64 encoded.
#[derive(Clone, Debug, Hash, PartialEq, Eq, Serialize)]
#[serde(untagged)]
enum Data<'a> {
    Text { text: Cow<'a, str> },
    Bytes {
        #[serde(serialize_with = "to_base64")]
        bytes: &'a [u8],
    },
}
|
||||||
|
|
||||||
|
impl<'a> Data<'a> {
    /// Build a `Data` from raw bytes: borrow them as text when they are
    /// valid UTF-8, and otherwise keep the raw bytes (which serialize as
    /// base64).
    fn from_bytes(bytes: &[u8]) -> Data {
        match str::from_utf8(bytes) {
            Ok(text) => Data::Text { text: Cow::Borrowed(text) },
            Err(_) => Data::Bytes { bytes },
        }
    }

    /// Build a `Data` from a file path.
    ///
    /// On Unix, paths are raw bytes, so a non-UTF-8 path is preserved
    /// losslessly via the `Bytes` variant.
    #[cfg(unix)]
    fn from_path(path: &Path) -> Data {
        use std::os::unix::ffi::OsStrExt;

        match path.to_str() {
            Some(text) => Data::Text { text: Cow::Borrowed(text) },
            None => Data::Bytes { bytes: path.as_os_str().as_bytes() },
        }
    }

    #[cfg(not(unix))]
    fn from_path(path: &Path) -> Data {
        // Using lossy conversion means some paths won't round trip precisely,
        // but it's not clear what we should actually do. Serde rejects
        // non-UTF-8 paths, and OsStr's are serialized as a sequence of UTF-16
        // code units on Windows. Neither seem appropriate for this use case,
        // so we do the easy thing for now.
        Data::Text { text: path.to_string_lossy() }
    }
}
|
||||||
|
|
||||||
|
fn to_base64<T, S>(
|
||||||
|
bytes: T,
|
||||||
|
ser: S,
|
||||||
|
) -> Result<S::Ok, S::Error>
|
||||||
|
where T: AsRef<[u8]>,
|
||||||
|
S: Serializer
|
||||||
|
{
|
||||||
|
ser.serialize_str(&base64::encode(&bytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ser_bytes<T, S>(
|
||||||
|
bytes: T,
|
||||||
|
ser: S,
|
||||||
|
) -> Result<S::Ok, S::Error>
|
||||||
|
where T: AsRef<[u8]>,
|
||||||
|
S: Serializer
|
||||||
|
{
|
||||||
|
Data::from_bytes(bytes.as_ref()).serialize(ser)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn ser_path<P, S>(
|
||||||
|
path: &Option<P>,
|
||||||
|
ser: S,
|
||||||
|
) -> Result<S::Ok, S::Error>
|
||||||
|
where P: AsRef<Path>,
|
||||||
|
S: Serializer
|
||||||
|
{
|
||||||
|
path.as_ref().map(|p| Data::from_path(p.as_ref())).serialize(ser)
|
||||||
|
}
|
||||||
106
grep-printer/src/lib.rs
Normal file
106
grep-printer/src/lib.rs
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
/*!
|
||||||
|
This crate provides featureful and fast printers that interoperate with the
|
||||||
|
[`grep-searcher`](https://docs.rs/grep-searcher)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
# Brief overview
|
||||||
|
|
||||||
|
The [`Standard`](struct.Standard.html) printer shows results in a human
|
||||||
|
readable format, and is modeled after the formats used by standard grep-like
|
||||||
|
tools. Features include, but are not limited to, cross platform terminal
|
||||||
|
coloring, search & replace, multi-line result handling and reporting summary
|
||||||
|
statistics.
|
||||||
|
|
||||||
|
The [`JSON`](struct.JSON.html) printer shows results in a machine readable
|
||||||
|
format. To facilitate a stream of search results, the format uses
|
||||||
|
[JSON Lines](http://jsonlines.org/)
|
||||||
|
by emitting a series of messages as search results are found.
|
||||||
|
|
||||||
|
The [`Summary`](struct.Summary.html) printer shows *aggregate* results for a
|
||||||
|
single search in a human readable format, and is modeled after similar formats
|
||||||
|
found in standard grep-like tools. This printer is useful for showing the total
|
||||||
|
number of matches and/or printing file paths that either contain or don't
|
||||||
|
contain matches.
|
||||||
|
|
||||||
|
# Example
|
||||||
|
|
||||||
|
This example shows how to create a "standard" printer and execute a search.
|
||||||
|
|
||||||
|
```
|
||||||
|
extern crate grep_regex;
|
||||||
|
extern crate grep_printer;
|
||||||
|
extern crate grep_searcher;
|
||||||
|
|
||||||
|
use std::error::Error;
|
||||||
|
|
||||||
|
use grep_regex::RegexMatcher;
|
||||||
|
use grep_printer::Standard;
|
||||||
|
use grep_searcher::Searcher;
|
||||||
|
|
||||||
|
const SHERLOCK: &'static [u8] = b"\
|
||||||
|
For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
Holmeses, success in the province of detective work must always
|
||||||
|
be, to a very large extent, the result of luck. Sherlock Holmes
|
||||||
|
can extract a clew from a wisp of straw or a flake of cigar ash;
|
||||||
|
but Doctor Watson has to have it taken out for him and dusted,
|
||||||
|
and exhibited clearly, with a label attached.
|
||||||
|
";
|
||||||
|
|
||||||
|
# fn main() { example().unwrap(); }
|
||||||
|
fn example() -> Result<(), Box<Error>> {
|
||||||
|
let matcher = RegexMatcher::new(r"Sherlock")?;
|
||||||
|
let mut printer = Standard::new_no_color(vec![]);
|
||||||
|
Searcher::new().search_slice(&matcher, SHERLOCK, printer.sink(&matcher))?;
|
||||||
|
|
||||||
|
// into_inner gives us back the underlying writer we provided to
|
||||||
|
// new_no_color, which is wrapped in a termcolor::NoColor. Thus, a second
|
||||||
|
// into_inner gives us back the actual buffer.
|
||||||
|
let output = String::from_utf8(printer.into_inner().into_inner())?;
|
||||||
|
let expected = "\
|
||||||
|
1:For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
3:be, to a very large extent, the result of luck. Sherlock Holmes
|
||||||
|
";
|
||||||
|
assert_eq!(output, expected);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
```
|
||||||
|
*/
|
||||||
|
|
||||||
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
extern crate base64;
|
||||||
|
extern crate grep_matcher;
|
||||||
|
#[cfg(test)]
|
||||||
|
extern crate grep_regex;
|
||||||
|
extern crate grep_searcher;
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
extern crate serde;
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
#[macro_use]
|
||||||
|
extern crate serde_derive;
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
extern crate serde_json;
|
||||||
|
extern crate termcolor;
|
||||||
|
|
||||||
|
pub use color::{ColorError, ColorSpecs, UserColorSpec, default_color_specs};
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
pub use json::{JSON, JSONBuilder, JSONSink};
|
||||||
|
pub use standard::{Standard, StandardBuilder, StandardSink};
|
||||||
|
pub use stats::Stats;
|
||||||
|
pub use summary::{Summary, SummaryBuilder, SummaryKind, SummarySink};
|
||||||
|
pub use util::PrinterPath;
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
mod macros;
|
||||||
|
|
||||||
|
mod color;
|
||||||
|
mod counter;
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
mod json;
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
mod jsont;
|
||||||
|
mod standard;
|
||||||
|
mod stats;
|
||||||
|
mod summary;
|
||||||
|
mod util;
|
||||||
24
grep-printer/src/macros.rs
Normal file
24
grep-printer/src/macros.rs
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
/// Like assert_eq, but nicer output for long strings.
///
/// On mismatch, each value is printed in its own fenced-off section rather
/// than inline, which makes large multi-line printer outputs easy to compare
/// by eye.
#[cfg(test)]
#[macro_export]
macro_rules! assert_eq_printed {
    ($expected:expr, $got:expr) => {
        // Deref through &String/&str etc. so comparison is by content.
        let expected = &*$expected;
        let got = &*$got;
        if expected != got {
            panic!("
printed outputs differ!

expected:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

got:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
", expected, got);
        }
    }
}
|
||||||
3053
grep-printer/src/standard.rs
Normal file
3053
grep-printer/src/standard.rs
Normal file
File diff suppressed because it is too large
Load Diff
147
grep-printer/src/stats.rs
Normal file
147
grep-printer/src/stats.rs
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
use std::ops::{Add, AddAssign};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use util::NiceDuration;
|
||||||
|
|
||||||
|
/// Summary statistics produced at the end of a search.
///
/// When statistics are reported by a printer, they correspond to all searches
/// executed with that printer.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
#[cfg_attr(feature = "serde1", derive(Serialize))]
pub struct Stats {
    // Total time spent searching.
    elapsed: NiceDuration,
    // Total number of searches executed.
    searches: u64,
    // Number of searches that found at least one match.
    searches_with_match: u64,
    // Total number of bytes searched.
    bytes_searched: u64,
    // Total number of bytes printed.
    bytes_printed: u64,
    // Total number of lines participating in a match.
    matched_lines: u64,
    // Total number of matches (there may be multiple per line).
    matches: u64,
}
|
||||||
|
|
||||||
|
impl Add for Stats {
    type Output = Stats;

    // Delegates to the by-reference implementation below.
    fn add(self, rhs: Stats) -> Stats {
        self + &rhs
    }
}

impl<'a> Add<&'a Stats> for Stats {
    type Output = Stats;

    // Combines two sets of statistics field-by-field.
    fn add(self, rhs: &'a Stats) -> Stats {
        Stats {
            elapsed: NiceDuration(self.elapsed.0 + rhs.elapsed.0),
            searches: self.searches + rhs.searches,
            searches_with_match:
                self.searches_with_match + rhs.searches_with_match,
            bytes_searched: self.bytes_searched + rhs.bytes_searched,
            bytes_printed: self.bytes_printed + rhs.bytes_printed,
            matched_lines: self.matched_lines + rhs.matched_lines,
            matches: self.matches + rhs.matches,
        }
    }
}

impl AddAssign for Stats {
    // Delegates to the by-reference implementation below.
    fn add_assign(&mut self, rhs: Stats) {
        *self += &rhs;
    }
}

impl<'a> AddAssign<&'a Stats> for Stats {
    // Accumulates another set of statistics into this one, in place.
    fn add_assign(&mut self, rhs: &'a Stats) {
        self.elapsed.0 += rhs.elapsed.0;
        self.searches += rhs.searches;
        self.searches_with_match += rhs.searches_with_match;
        self.bytes_searched += rhs.bytes_searched;
        self.bytes_printed += rhs.bytes_printed;
        self.matched_lines += rhs.matched_lines;
        self.matches += rhs.matches;
    }
}
|
||||||
|
|
||||||
|
impl Stats {
    /// Return a new value for tracking aggregate statistics across searches.
    ///
    /// All statistics are set to `0`.
    pub fn new() -> Stats {
        Stats::default()
    }

    /// Return the total amount of time elapsed.
    pub fn elapsed(&self) -> Duration {
        self.elapsed.0
    }

    /// Return the total number of searches executed.
    pub fn searches(&self) -> u64 {
        self.searches
    }

    /// Return the total number of searches that found at least one match.
    pub fn searches_with_match(&self) -> u64 {
        self.searches_with_match
    }

    /// Return the total number of bytes searched.
    pub fn bytes_searched(&self) -> u64 {
        self.bytes_searched
    }

    /// Return the total number of bytes printed.
    pub fn bytes_printed(&self) -> u64 {
        self.bytes_printed
    }

    /// Return the total number of lines that participated in a match.
    ///
    /// When matches may contain multiple lines then this includes every line
    /// that is part of every match.
    pub fn matched_lines(&self) -> u64 {
        self.matched_lines
    }

    /// Return the total number of matches.
    ///
    /// There may be multiple matches per line.
    pub fn matches(&self) -> u64 {
        self.matches
    }

    /// Add to the elapsed time.
    pub fn add_elapsed(&mut self, duration: Duration) {
        self.elapsed.0 += duration;
    }

    /// Add to the number of searches executed.
    pub fn add_searches(&mut self, n: u64) {
        self.searches += n;
    }

    /// Add to the number of searches that found at least one match.
    pub fn add_searches_with_match(&mut self, n: u64) {
        self.searches_with_match += n;
    }

    /// Add to the total number of bytes searched.
    pub fn add_bytes_searched(&mut self, n: u64) {
        self.bytes_searched += n;
    }

    /// Add to the total number of bytes printed.
    pub fn add_bytes_printed(&mut self, n: u64) {
        self.bytes_printed += n;
    }

    /// Add to the total number of lines that participated in a match.
    pub fn add_matched_lines(&mut self, n: u64) {
        self.matched_lines += n;
    }

    /// Add to the total number of matches.
    pub fn add_matches(&mut self, n: u64) {
        self.matches += n;
    }
}
|
||||||
1071
grep-printer/src/summary.rs
Normal file
1071
grep-printer/src/summary.rs
Normal file
File diff suppressed because it is too large
Load Diff
392
grep-printer/src/util.rs
Normal file
392
grep-printer/src/util.rs
Normal file
@@ -0,0 +1,392 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::fmt;
|
||||||
|
use std::io;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::time;
|
||||||
|
|
||||||
|
use grep_matcher::{Captures, LineTerminator, Match, Matcher};
|
||||||
|
use grep_searcher::{
|
||||||
|
LineIter,
|
||||||
|
SinkError, SinkContext, SinkContextKind, SinkMatch,
|
||||||
|
};
|
||||||
|
#[cfg(feature = "serde1")]
|
||||||
|
use serde::{Serialize, Serializer};
|
||||||
|
|
||||||
|
/// A type for handling replacements while amortizing allocation.
pub struct Replacer<M: Matcher> {
    /// Lazily allocated scratch space; `None` until the first replacement
    /// is executed.
    space: Option<Space<M>>,
}

/// The scratch space backing a `Replacer`.
struct Space<M: Matcher> {
    /// The place to store capture locations.
    caps: M::Captures,
    /// The place to write a replacement to.
    dst: Vec<u8>,
    /// The place to store match offsets in terms of `dst`.
    matches: Vec<Match>,
}
|
||||||
|
|
||||||
|
impl<M: Matcher> fmt::Debug for Replacer<M> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // When no replacement has happened yet, render empty slices.
        let (dst, matches) = match self.replacement() {
            Some(parts) => parts,
            None => (&[][..], &[][..]),
        };
        let mut dbg = f.debug_struct("Replacer");
        dbg.field("dst", &dst);
        dbg.field("matches", &matches);
        dbg.finish()
    }
}
|
||||||
|
|
||||||
|
impl<M: Matcher> Replacer<M> {
    /// Create a new replacer for use with a particular matcher.
    ///
    /// This constructor does not allocate. Instead, space for dealing with
    /// replacements is allocated lazily only when needed.
    pub fn new() -> Replacer<M> {
        Replacer { space: None }
    }

    /// Executes a replacement on the given subject string by replacing all
    /// matches with the given replacement. To access the result of the
    /// replacement, use the `replacement` method.
    ///
    /// This can fail if the underlying matcher reports an error.
    pub fn replace_all<'a>(
        &'a mut self,
        matcher: &M,
        subject: &[u8],
        replacement: &[u8],
    ) -> io::Result<()> {
        {
            // Inner scope limits the mutable borrows of our scratch space.
            let &mut Space {
                ref mut dst,
                ref mut caps,
                ref mut matches,
            } = self.allocate(matcher)?;
            dst.clear();
            matches.clear();

            matcher.replace_with_captures(
                subject,
                caps,
                dst,
                |caps, dst| {
                    // Interpolate the replacement for this match into `dst`
                    // and record the region it occupies, in `dst` offsets.
                    let start = dst.len();
                    caps.interpolate(
                        |name| matcher.capture_index(name),
                        subject,
                        replacement,
                        dst,
                    );
                    let end = dst.len();
                    matches.push(Match::new(start, end));
                    true
                },
            ).map_err(io::Error::error_message)?;
        }
        Ok(())
    }

    /// Return the result of the prior replacement and the match offsets for
    /// all replacement occurrences within the returned replacement buffer.
    ///
    /// If no replacement has occurred then `None` is returned.
    pub fn replacement<'a>(&'a self) -> Option<(&'a [u8], &'a [Match])> {
        match self.space {
            None => None,
            Some(ref space) => {
                if space.matches.is_empty() {
                    None
                } else {
                    Some((&space.dst, &space.matches))
                }
            }
        }
    }

    /// Clear space used for performing a replacement.
    ///
    /// Subsequent calls to `replacement` after calling `clear` (but before
    /// executing another replacement) will always return `None`.
    pub fn clear(&mut self) {
        if let Some(ref mut space) = self.space {
            space.dst.clear();
            space.matches.clear();
        }
    }

    /// Allocate space for replacements when used with the given matcher and
    /// return a mutable reference to that space.
    ///
    /// This can fail if allocating space for capture locations from the given
    /// matcher fails.
    fn allocate(&mut self, matcher: &M) -> io::Result<&mut Space<M>> {
        if self.space.is_none() {
            let caps = matcher
                .new_captures()
                .map_err(io::Error::error_message)?;
            self.space = Some(Space {
                caps: caps,
                dst: vec![],
                matches: vec![],
            });
        }
        Ok(self.space.as_mut().unwrap())
    }
}
|
||||||
|
|
||||||
|
/// A simple layer of abstraction over either a match or a contextual line
/// reported by the searcher.
///
/// In particular, this provides an API that unions the `SinkMatch` and
/// `SinkContext` types while also exposing a list of all individual match
/// locations.
///
/// While this serves as a convenient mechanism to abstract over `SinkMatch`
/// and `SinkContext`, this also provides a way to abstract over replacements.
/// Namely, after a replacement, a `Sunk` value can be constructed using the
/// results of the replacement instead of the bytes reported directly by the
/// searcher.
#[derive(Debug)]
pub struct Sunk<'a> {
    /// The bytes to print (possibly the output of a replacement).
    bytes: &'a [u8],
    /// Absolute byte offset of the start of this data in the haystack.
    absolute_byte_offset: u64,
    /// Line number of the first line, when line numbering is enabled.
    line_number: Option<u64>,
    /// `Some` for contextual lines, `None` for matches.
    context_kind: Option<&'a SinkContextKind>,
    /// Match offsets relative to `bytes` (post-replacement when one ran).
    matches: &'a [Match],
    /// Match offsets as originally reported by the searcher.
    original_matches: &'a [Match],
}

impl<'a> Sunk<'a> {
    /// Return an empty `Sunk` with no bytes and no matches.
    #[inline]
    pub fn empty() -> Sunk<'static> {
        Sunk {
            bytes: &[],
            absolute_byte_offset: 0,
            line_number: None,
            context_kind: None,
            matches: &[],
            original_matches: &[],
        }
    }

    /// Build a `Sunk` from a searcher's match, optionally substituting a
    /// replacement's bytes/offsets for the originals.
    #[inline]
    pub fn from_sink_match(
        sunk: &'a SinkMatch<'a>,
        original_matches: &'a [Match],
        replacement: Option<(&'a [u8], &'a [Match])>,
    ) -> Sunk<'a> {
        let (bytes, matches) = replacement.unwrap_or_else(|| {
            (sunk.bytes(), original_matches)
        });
        Sunk {
            bytes: bytes,
            absolute_byte_offset: sunk.absolute_byte_offset(),
            line_number: sunk.line_number(),
            context_kind: None,
            matches: matches,
            original_matches: original_matches,
        }
    }

    /// Build a `Sunk` from a searcher's contextual line, optionally
    /// substituting a replacement's bytes/offsets for the originals.
    #[inline]
    pub fn from_sink_context(
        sunk: &'a SinkContext<'a>,
        original_matches: &'a [Match],
        replacement: Option<(&'a [u8], &'a [Match])>,
    ) -> Sunk<'a> {
        let (bytes, matches) = replacement.unwrap_or_else(|| {
            (sunk.bytes(), original_matches)
        });
        Sunk {
            bytes: bytes,
            absolute_byte_offset: sunk.absolute_byte_offset(),
            line_number: sunk.line_number(),
            context_kind: Some(sunk.kind()),
            matches: matches,
            original_matches: original_matches,
        }
    }

    /// Return the context kind, or `None` when this is a match.
    #[inline]
    pub fn context_kind(&self) -> Option<&'a SinkContextKind> {
        self.context_kind
    }

    /// Return the bytes to print (post-replacement when one ran).
    #[inline]
    pub fn bytes(&self) -> &'a [u8] {
        self.bytes
    }

    /// Return the match offsets relative to `bytes`.
    #[inline]
    pub fn matches(&self) -> &'a [Match] {
        self.matches
    }

    /// Return the match offsets as originally reported by the searcher.
    #[inline]
    pub fn original_matches(&self) -> &'a [Match] {
        self.original_matches
    }

    /// Return an iterator over the lines in `bytes`, split by `line_term`.
    #[inline]
    pub fn lines(&self, line_term: u8) -> LineIter<'a> {
        LineIter::new(line_term, self.bytes())
    }

    /// Return the absolute byte offset of this data in the haystack.
    #[inline]
    pub fn absolute_byte_offset(&self) -> u64 {
        self.absolute_byte_offset
    }

    /// Return the line number of the first line, if available.
    #[inline]
    pub fn line_number(&self) -> Option<u64> {
        self.line_number
    }
}
|
||||||
|
|
||||||
|
/// A simple encapsulation of a file path used by a printer.
///
/// This represents any transforms that we might want to perform on the path,
/// such as converting it to valid UTF-8 and/or replacing its separator with
/// something else. This allows us to amortize work if we are printing the
/// file path for every match.
///
/// In the common case, no transformation is needed, which lets us avoid the
/// allocation. Typically, only Windows requires a transform, since we can't
/// access the raw bytes of a path directly and first need to lossily convert
/// to UTF-8. Windows is also typically where the path separator replacement
/// is used, e.g., in cygwin environments to use `/` instead of `\`.
///
/// Users of this type are expected to construct it from a normal `Path`
/// found in the standard library. It can then be written to any `io::Write`
/// implementation using the `as_bytes` method. This achieves platform
/// portability with a small cost: on Windows, paths that are not valid UTF-16
/// will not roundtrip correctly.
#[derive(Clone, Debug)]
pub struct PrinterPath<'a>(Cow<'a, [u8]>);

impl<'a> PrinterPath<'a> {
    /// Create a new path suitable for printing.
    pub fn new(path: &'a Path) -> PrinterPath<'a> {
        PrinterPath::new_impl(path)
    }

    // On Unix a path is raw bytes, so no conversion or allocation is needed.
    #[cfg(unix)]
    fn new_impl(path: &'a Path) -> PrinterPath<'a> {
        use std::os::unix::ffi::OsStrExt;
        PrinterPath(Cow::Borrowed(path.as_os_str().as_bytes()))
    }

    // Elsewhere, convert lossily to UTF-8; this only allocates when the
    // path is not already valid UTF-8.
    #[cfg(not(unix))]
    fn new_impl(path: &'a Path) -> PrinterPath<'a> {
        PrinterPath(match path.to_string_lossy() {
            Cow::Owned(path) => Cow::Owned(path.into_bytes()),
            Cow::Borrowed(path) => Cow::Borrowed(path.as_bytes()),
        })
    }

    /// Create a new printer path from the given path which can be efficiently
    /// written to a writer without allocation.
    ///
    /// If the given separator is present, then any separators in `path` are
    /// replaced with it.
    pub fn with_separator(path: &'a Path, sep: Option<u8>) -> PrinterPath<'a> {
        let mut ppath = PrinterPath::new(path);
        if let Some(sep) = sep {
            ppath.replace_separator(sep);
        }
        ppath
    }

    /// Replace the path separator in this path with the given separator
    /// and do it in place. On Windows, both `/` and `\` are treated as
    /// path separators that are both replaced by `new_sep`. In all other
    /// environments, only `/` is treated as a path separator.
    fn replace_separator(&mut self, new_sep: u8) {
        let transformed_path: Vec<_> = self.as_bytes().iter().map(|&b| {
            if b == b'/' || (cfg!(windows) && b == b'\\') {
                new_sep
            } else {
                b
            }
        }).collect();
        self.0 = Cow::Owned(transformed_path);
    }

    /// Return the raw bytes for this path.
    pub fn as_bytes(&self) -> &[u8] {
        &*self.0
    }
}
|
||||||
|
|
||||||
|
/// A type that provides "nicer" Display and Serialize impls for
/// std::time::Duration. The serialization format should actually be compatible
/// with the Deserialize impl for std::time::Duration, since this type only
/// adds new fields.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub struct NiceDuration(pub time::Duration);

impl fmt::Display for NiceDuration {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Always render six digits after the decimal point, e.g. "0.000123s".
        write!(f, "{:0.6}s", self.fractional_seconds())
    }
}

impl NiceDuration {
    /// Returns the number of seconds in this duration in fraction form.
    /// The number to the left of the decimal point is the number of whole
    /// seconds, and the number to the right is the fractional part of a
    /// second, derived from the duration's nanoseconds.
    fn fractional_seconds(&self) -> f64 {
        let fractional = (self.0.subsec_nanos() as f64) / 1_000_000_000.0;
        self.0.as_secs() as f64 + fractional
    }
}
|
||||||
|
|
||||||
|
#[cfg(feature = "serde1")]
impl Serialize for NiceDuration {
    /// Serializes as a struct with `secs`/`nanos` fields (compatible with
    /// serde's Deserialize impl for std::time::Duration) plus a `human`
    /// field carrying the Display rendering.
    fn serialize<S: Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeStruct;

        // The declared length must match the number of fields serialized
        // (3, not 2): some serde formats rely on this hint.
        let mut state = ser.serialize_struct("Duration", 3)?;
        state.serialize_field("secs", &self.0.as_secs())?;
        state.serialize_field("nanos", &self.0.subsec_nanos())?;
        state.serialize_field("human", &format!("{}", self))?;
        state.end()
    }
}
||||||
|
|
||||||
|
/// Trim prefix ASCII spaces from the given slice and return the corresponding
/// range.
///
/// This stops trimming a prefix as soon as it sees non-whitespace or a line
/// terminator.
pub fn trim_ascii_prefix_range(
    line_term: LineTerminator,
    slice: &[u8],
    range: Match,
) -> Match {
    // ASCII whitespace: tab, LF, vertical tab, form feed, CR, space.
    fn is_space(b: u8) -> bool {
        match b {
            b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => true,
            _ => false,
        }
    }

    // Count leading whitespace bytes, but never consume the line
    // terminator's own bytes.
    let count = slice[range]
        .iter()
        .take_while(|&&b| -> bool {
            is_space(b) && !line_term.as_bytes().contains(&b)
        })
        .count();
    range.with_start(range.start() + count)
}
|
||||||
|
|
||||||
|
/// Trim prefix ASCII spaces from the given slice and return the corresponding
|
||||||
|
/// sub-slice.
|
||||||
|
pub fn trim_ascii_prefix(line_term: LineTerminator, slice: &[u8]) -> &[u8] {
|
||||||
|
let range = trim_ascii_prefix_range(
|
||||||
|
line_term,
|
||||||
|
slice,
|
||||||
|
Match::new(0, slice.len()),
|
||||||
|
);
|
||||||
|
&slice[range]
|
||||||
|
}
|
||||||
21
grep-regex/Cargo.toml
Normal file
21
grep-regex/Cargo.toml
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-regex"
|
||||||
|
version = "0.1.2" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
Use Rust's regex library with the 'grep' crate.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-regex"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["regex", "grep", "search", "pattern", "line"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
log = "0.4.5"
|
||||||
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
|
regex = "1.1"
|
||||||
|
regex-syntax = "0.6.5"
|
||||||
|
thread_local = "0.3.6"
|
||||||
|
utf8-ranges = "1.0.1"
|
||||||
21
grep-regex/LICENSE-MIT
Normal file
21
grep-regex/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
35
grep-regex/README.md
Normal file
35
grep-regex/README.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
grep-regex
|
||||||
|
----------
|
||||||
|
The `grep-regex` crate provides an implementation of the `Matcher` trait from
|
||||||
|
the `grep-matcher` crate. This implementation permits Rust's regex engine to
|
||||||
|
be used in the `grep` crate for fast line-oriented searching.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-regex)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-regex](https://docs.rs/grep-regex)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-regex = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_regex;
|
||||||
|
```
|
||||||
24
grep-regex/UNLICENSE
Normal file
24
grep-regex/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
263
grep-regex/src/ast.rs
Normal file
263
grep-regex/src/ast.rs
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
use regex_syntax::ast::{self, Ast};
|
||||||
|
use regex_syntax::ast::parse::Parser;
|
||||||
|
|
||||||
|
/// The results of analyzing the AST of a regular expression (e.g., for
/// supporting smart case).
#[derive(Clone, Debug)]
pub struct AstAnalysis {
    /// True if and only if a literal uppercase character occurs in the regex.
    any_uppercase: bool,
    /// True if and only if the regex contains any literal at all.
    any_literal: bool,
    /// True if and only if the regex consists entirely of a literal and no
    /// other special regex characters.
    all_verbatim_literal: bool,
}
|
||||||
|
|
||||||
|
impl AstAnalysis {
    /// Returns an `AstAnalysis` value by doing analysis on the AST of
    /// `pattern`.
    ///
    /// If `pattern` is not a valid regular expression, then `None` is
    /// returned.
    #[allow(dead_code)]
    pub fn from_pattern(pattern: &str) -> Option<AstAnalysis> {
        Parser::new()
            .parse(pattern)
            .map(|ast| AstAnalysis::from_ast(&ast))
            .ok()
    }

    /// Perform an AST analysis given the AST.
    pub fn from_ast(ast: &Ast) -> AstAnalysis {
        let mut analysis = AstAnalysis::new();
        analysis.from_ast_impl(ast);
        analysis
    }

    /// Returns true if and only if a literal uppercase character occurs in
    /// the pattern.
    ///
    /// For example, a pattern like `\pL` contains no uppercase literals,
    /// even though `L` is uppercase and the `\pL` class contains uppercase
    /// characters.
    pub fn any_uppercase(&self) -> bool {
        self.any_uppercase
    }

    /// Returns true if and only if the regex contains any literal at all.
    ///
    /// For example, a pattern like `\pL` reports `false`, but a pattern like
    /// `\pLfoo` reports `true`.
    pub fn any_literal(&self) -> bool {
        self.any_literal
    }

    /// Returns true if and only if the entire pattern is a verbatim literal
    /// with no special meta characters.
    ///
    /// When this is true, then the pattern satisfies the following law:
    /// `escape(pattern) == pattern`. Notable examples where this returns
    /// `false` include patterns like `a\u0061` even though `\u0061` is just
    /// a literal `a`.
    ///
    /// The purpose of this flag is to determine whether the patterns can be
    /// given to non-regex substring search algorithms as-is.
    #[allow(dead_code)]
    pub fn all_verbatim_literal(&self) -> bool {
        self.all_verbatim_literal
    }

    /// Creates a new `AstAnalysis` value with an initial configuration.
    fn new() -> AstAnalysis {
        AstAnalysis {
            any_uppercase: false,
            any_literal: false,
            all_verbatim_literal: true,
        }
    }

    /// Recursively folds the attributes of `ast` into this analysis,
    /// short-circuiting as soon as no attribute can change (see `done`).
    fn from_ast_impl(&mut self, ast: &Ast) {
        if self.done() {
            return;
        }
        match *ast {
            Ast::Empty(_) => {}
            // Any non-literal construct means the pattern as a whole is not
            // a verbatim literal.
            Ast::Flags(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {
                self.all_verbatim_literal = false;
            }
            Ast::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            Ast::Class(ast::Class::Bracketed(ref x)) => {
                self.all_verbatim_literal = false;
                self.from_ast_class_set(&x.kind);
            }
            Ast::Repetition(ref x) => {
                self.all_verbatim_literal = false;
                self.from_ast_impl(&x.ast);
            }
            Ast::Group(ref x) => {
                self.all_verbatim_literal = false;
                self.from_ast_impl(&x.ast);
            }
            Ast::Alternation(ref alt) => {
                self.all_verbatim_literal = false;
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
            // A concatenation of verbatim literals is still verbatim, so
            // `all_verbatim_literal` is left untouched here.
            Ast::Concat(ref alt) => {
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
        }
    }

    /// Folds the attributes of a bracketed class set (e.g., `[a-z]`) into
    /// this analysis.
    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
        if self.done() {
            return;
        }
        match *ast {
            ast::ClassSet::Item(ref item) => {
                self.from_ast_class_set_item(item);
            }
            ast::ClassSet::BinaryOp(ref x) => {
                self.from_ast_class_set(&x.lhs);
                self.from_ast_class_set(&x.rhs);
            }
        }
    }

    /// Folds the attributes of a single class set item into this analysis.
    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
        if self.done() {
            return;
        }
        match *ast {
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {}
            ast::ClassSetItem::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            ast::ClassSetItem::Range(ref x) => {
                self.from_ast_literal(&x.start);
                self.from_ast_literal(&x.end);
            }
            ast::ClassSetItem::Bracketed(ref x) => {
                self.from_ast_class_set(&x.kind);
            }
            ast::ClassSetItem::Union(ref union) => {
                for x in &union.items {
                    self.from_ast_class_set_item(x);
                }
            }
        }
    }

    /// Folds the attributes of a single literal into this analysis. Only
    /// verbatim literals (not escapes like `\u0061`) preserve
    /// `all_verbatim_literal`.
    fn from_ast_literal(&mut self, ast: &ast::Literal) {
        if ast.kind != ast::LiteralKind::Verbatim {
            self.all_verbatim_literal = false;
        }
        self.any_literal = true;
        self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
    }

    /// Returns true if and only if the attributes can never change no matter
    /// what other AST it might see.
    fn done(&self) -> bool {
        self.any_uppercase && self.any_literal && !self.all_verbatim_literal
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    fn analysis(pattern: &str) -> AstAnalysis {
        AstAnalysis::from_pattern(pattern).unwrap()
    }

    /// Asserts that analyzing `pattern` yields the given
    /// `(any_uppercase, any_literal, all_verbatim_literal)` triple.
    fn assert_analysis(pattern: &str, expected: (bool, bool, bool)) {
        let x = analysis(pattern);
        assert_eq!(
            (x.any_uppercase, x.any_literal, x.all_verbatim_literal),
            expected,
            "pattern: {:?}",
            pattern
        );
    }

    #[test]
    fn various() {
        assert_analysis("", (false, false, true));
        assert_analysis("foo", (false, true, true));
        assert_analysis("Foo", (true, true, true));
        assert_analysis("foO", (true, true, true));
        assert_analysis(r"foo\\", (false, true, false));
        assert_analysis(r"foo\w", (false, true, false));
        assert_analysis(r"foo\S", (false, true, false));
        assert_analysis(r"foo\p{Ll}", (false, true, false));
        assert_analysis(r"foo[a-z]", (false, true, false));
        assert_analysis(r"foo[A-Z]", (true, true, false));
        assert_analysis(r"foo[\S\t]", (false, true, false));
        assert_analysis(r"foo\\S", (true, true, false));
        assert_analysis(r"\p{Ll}", (false, false, false));
        assert_analysis(r"aBc\w", (true, true, false));
        assert_analysis(r"a\u0061", (false, true, false));
    }
}
|
||||||
273
grep-regex/src/config.rs
Normal file
273
grep-regex/src/config.rs
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
use grep_matcher::{ByteSet, LineTerminator};
|
||||||
|
use regex::bytes::{Regex, RegexBuilder};
|
||||||
|
use regex_syntax::ast::{self, Ast};
|
||||||
|
use regex_syntax::hir::Hir;
|
||||||
|
|
||||||
|
use ast::AstAnalysis;
|
||||||
|
use crlf::crlfify;
|
||||||
|
use error::Error;
|
||||||
|
use literal::LiteralSets;
|
||||||
|
use non_matching::non_matching_bytes;
|
||||||
|
use strip::strip_from_match;
|
||||||
|
|
||||||
|
/// Config represents the configuration of a regex matcher in this crate.
/// The configuration is itself a rough combination of the knobs found in
/// the `regex` crate itself, along with additional `grep-matcher` specific
/// options.
///
/// The configuration can be used to build a "configured" HIR expression. A
/// configured HIR expression is an HIR expression that is aware of the
/// configuration which generated it, and provides transformation on that HIR
/// such that the configuration is preserved.
#[derive(Clone, Debug)]
pub struct Config {
    /// Whether to match case insensitively.
    pub case_insensitive: bool,
    /// Whether to enable "smart case": match case insensitively only when
    /// the pattern contains no uppercase literals.
    pub case_smart: bool,
    /// Whether to enable multi-line mode in the underlying regex.
    pub multi_line: bool,
    /// Whether `.` should match line terminators.
    pub dot_matches_new_line: bool,
    /// Whether to swap the meaning of greedy and non-greedy repetitions.
    pub swap_greed: bool,
    /// Whether to ignore insignificant whitespace in the pattern.
    pub ignore_whitespace: bool,
    /// Whether the pattern should be Unicode aware.
    pub unicode: bool,
    /// Whether to support octal escape sequences in the pattern.
    pub octal: bool,
    /// A limit on the size of the compiled regex.
    pub size_limit: usize,
    /// A limit on the cache size used by the regex's lazy DFA.
    pub dfa_size_limit: usize,
    /// A limit on the nesting depth of the parsed pattern.
    pub nest_limit: u32,
    /// When set, the pattern is transformed so that it can never match
    /// this line terminator (see `Config::hir` / `strip_from_match`).
    pub line_terminator: Option<LineTerminator>,
    /// Whether to enable the CRLF hack, which replaces multi-line `$` with
    /// `(?:\r?$)` (see `crlfify`).
    pub crlf: bool,
    /// Whether to require word-boundary matches.
    /// NOTE(review): its use isn't visible in this chunk — presumably
    /// consumed by the `WordMatcher`; confirm against the rest of the crate.
    pub word: bool,
}
|
||||||
|
|
||||||
|
impl Default for Config {
|
||||||
|
fn default() -> Config {
|
||||||
|
Config {
|
||||||
|
case_insensitive: false,
|
||||||
|
case_smart: false,
|
||||||
|
multi_line: false,
|
||||||
|
dot_matches_new_line: false,
|
||||||
|
swap_greed: false,
|
||||||
|
ignore_whitespace: false,
|
||||||
|
unicode: true,
|
||||||
|
octal: false,
|
||||||
|
// These size limits are much bigger than what's in the regex
|
||||||
|
// crate.
|
||||||
|
size_limit: 100 * (1<<20),
|
||||||
|
dfa_size_limit: 1000 * (1<<20),
|
||||||
|
nest_limit: 250,
|
||||||
|
line_terminator: None,
|
||||||
|
crlf: false,
|
||||||
|
word: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Config {
    /// Parse the given pattern and return its HIR expression along with
    /// the current configuration.
    ///
    /// If there was a problem parsing the given expression then an error
    /// is returned.
    pub fn hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
        // Run the AST analysis (used for smart case) before translating the
        // pattern to HIR.
        let analysis = self.analysis(pattern)?;
        let expr = ::regex_syntax::ParserBuilder::new()
            .nest_limit(self.nest_limit)
            .octal(self.octal)
            .allow_invalid_utf8(true)
            .ignore_whitespace(self.ignore_whitespace)
            .case_insensitive(self.is_case_insensitive(&analysis)?)
            .multi_line(self.multi_line)
            .dot_matches_new_line(self.dot_matches_new_line)
            .swap_greed(self.swap_greed)
            .unicode(self.unicode)
            .build()
            .parse(pattern)
            .map_err(Error::regex)?;
        // When a line terminator is configured, rewrite the expression so
        // that it can never match the terminator byte(s).
        let expr = match self.line_terminator {
            None => expr,
            Some(line_term) => strip_from_match(expr, line_term)?,
        };
        Ok(ConfiguredHIR {
            original: pattern.to_string(),
            config: self.clone(),
            analysis: analysis,
            // If CRLF mode is enabled, replace `$` with `(?:\r?$)`.
            expr: if self.crlf { crlfify(expr) } else { expr },
        })
    }

    /// Accounting for the `smart_case` config knob, return true if and only if
    /// this pattern should be matched case insensitively.
    fn is_case_insensitive(
        &self,
        analysis: &AstAnalysis,
    ) -> Result<bool, Error> {
        if self.case_insensitive {
            return Ok(true);
        }
        if !self.case_smart {
            return Ok(false);
        }
        // Smart case: insensitive only when the pattern contains at least
        // one literal and no uppercase literals.
        Ok(analysis.any_literal() && !analysis.any_uppercase())
    }

    /// Perform analysis on the AST of this pattern.
    ///
    /// This returns an error if the given pattern failed to parse.
    fn analysis(&self, pattern: &str) -> Result<AstAnalysis, Error> {
        Ok(AstAnalysis::from_ast(&self.ast(pattern)?))
    }

    /// Parse the given pattern into its abstract syntax.
    ///
    /// This returns an error if the given pattern failed to parse.
    fn ast(&self, pattern: &str) -> Result<Ast, Error> {
        ast::parse::ParserBuilder::new()
            .nest_limit(self.nest_limit)
            .octal(self.octal)
            .ignore_whitespace(self.ignore_whitespace)
            .build()
            .parse(pattern)
            .map_err(Error::regex)
    }
}
|
||||||
|
|
||||||
|
/// A "configured" HIR expression, which is aware of the configuration which
/// produced this HIR.
///
/// Since the configuration is tracked, values with this type can be
/// transformed into other HIR expressions (or regular expressions) in a way
/// that preserves the configuration. For example, the `fast_line_regex`
/// method will apply literal extraction to the inner HIR and use that to build
/// a new regex that matches the extracted literals in a way that is
/// consistent with the configuration that produced this HIR. For example, the
/// size limits set on the configured HIR will be propagated out to any
/// subsequently constructed HIR or regular expression.
#[derive(Clone, Debug)]
pub struct ConfiguredHIR {
    /// The original pattern string, as given by the caller.
    original: String,
    /// The configuration that produced `expr`.
    config: Config,
    /// The AST analysis of the original pattern (e.g., for smart case).
    analysis: AstAnalysis,
    /// The translated (and possibly transformed) HIR expression.
    expr: Hir,
}
|
||||||
|
|
||||||
|
impl ConfiguredHIR {
    /// Return the configuration for this HIR expression.
    pub fn config(&self) -> &Config {
        &self.config
    }

    /// Compute the set of non-matching bytes for this HIR expression.
    pub fn non_matching_bytes(&self) -> ByteSet {
        non_matching_bytes(&self.expr)
    }

    /// Returns true if and only if this regex needs to have its match offsets
    /// tweaked because of CRLF support. Specifically, this occurs when the
    /// CRLF hack is enabled and the regex is line anchored at the end. In
    /// this case, matches that end with a `\r` have the `\r` stripped.
    pub fn needs_crlf_stripped(&self) -> bool {
        self.config.crlf && self.expr.is_line_anchored_end()
    }

    /// Builds a regular expression from this HIR expression.
    pub fn regex(&self) -> Result<Regex, Error> {
        // Round-trip through the HIR's concrete syntax so that all
        // configuration folded into the HIR is preserved.
        self.pattern_to_regex(&self.expr.to_string())
    }

    /// Applies the given function to the concrete syntax of this HIR and then
    /// generates a new HIR based on the result of the function in a way that
    /// preserves the configuration.
    ///
    /// For example, this can be used to wrap a user provided regular
    /// expression with additional semantics. e.g., See the `WordMatcher`.
    pub fn with_pattern<F: FnMut(&str) -> String>(
        &self,
        mut f: F,
    ) -> Result<ConfiguredHIR, Error>
    {
        self.pattern_to_hir(&f(&self.expr.to_string()))
    }

    /// If the current configuration has a line terminator set and if useful
    /// literals could be extracted, then a regular expression matching those
    /// literals is returned. If no line terminator is set, then `None` is
    /// returned.
    ///
    /// If compiling the resulting regular expression failed, then an error
    /// is returned.
    ///
    /// This method only returns something when a line terminator is set
    /// because matches from this regex are generally candidates that must be
    /// confirmed before reporting a match. When performing a line oriented
    /// search, confirmation is easy: just extend the candidate match to its
    /// respective line boundaries and then re-search that line for a full
    /// match. This only works when the line terminator is set because the line
    /// terminator setting guarantees that the regex itself can never match
    /// through the line terminator byte.
    pub fn fast_line_regex(&self) -> Result<Option<Regex>, Error> {
        if self.config.line_terminator.is_none() {
            return Ok(None);
        }
        match LiteralSets::new(&self.expr).one_regex() {
            None => Ok(None),
            Some(pattern) => self.pattern_to_regex(&pattern).map(Some),
        }
    }

    /// Create a regex from the given pattern using this HIR's configuration.
    fn pattern_to_regex(&self, pattern: &str) -> Result<Regex, Error> {
        // The settings we explicitly set here are intentionally a subset
        // of the settings we have. The key point here is that our HIR
        // expression is computed with the settings in mind, such that setting
        // them here could actually lead to unintended behavior. For example,
        // consider the pattern `(?U)a+`. This will get folded into the HIR
        // as a non-greedy repetition operator which will in turn get printed
        // to the concrete syntax as `a+?`, which is correct. But if we
        // set the `swap_greed` option again, then we'll wind up with `(?U)a+?`
        // which is equal to `a+` which is not the same as what we were given.
        //
        // We also don't need to apply `case_insensitive` since this gets
        // folded into the HIR and would just cause us to do redundant work.
        //
        // Finally, we don't need to set `ignore_whitespace` since the concrete
        // syntax emitted by the HIR printer never needs it.
        //
        // We set the rest of the options. Some of them are important, such as
        // the size limit, and some of them are necessary to preserve the
        // intention of the original pattern. For example, the Unicode flag
        // will impact how the WordMatcher functions, namely, whether its
        // word boundaries are Unicode aware or not.
        RegexBuilder::new(&pattern)
            .nest_limit(self.config.nest_limit)
            .octal(self.config.octal)
            .multi_line(self.config.multi_line)
            .dot_matches_new_line(self.config.dot_matches_new_line)
            .unicode(self.config.unicode)
            .size_limit(self.config.size_limit)
            .dfa_size_limit(self.config.dfa_size_limit)
            .build()
            .map_err(Error::regex)
    }

    /// Create an HIR expression from the given pattern using this HIR's
    /// configuration.
    fn pattern_to_hir(&self, pattern: &str) -> Result<ConfiguredHIR, Error> {
        // See `pattern_to_regex` comment for explanation of why we only set
        // a subset of knobs here. e.g., `swap_greed` is explicitly left out.
        let expr = ::regex_syntax::ParserBuilder::new()
            .nest_limit(self.config.nest_limit)
            .octal(self.config.octal)
            .allow_invalid_utf8(true)
            .multi_line(self.config.multi_line)
            .dot_matches_new_line(self.config.dot_matches_new_line)
            .unicode(self.config.unicode)
            .build()
            .parse(pattern)
            .map_err(Error::regex)?;
        Ok(ConfiguredHIR {
            original: self.original.clone(),
            config: self.config.clone(),
            analysis: self.analysis.clone(),
            expr: expr,
        })
    }
}
|
||||||
183
grep-regex/src/crlf.rs
Normal file
183
grep-regex/src/crlf.rs
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use grep_matcher::{Match, Matcher, NoError};
|
||||||
|
use regex::bytes::Regex;
|
||||||
|
use regex_syntax::hir::{self, Hir, HirKind};
|
||||||
|
|
||||||
|
use config::ConfiguredHIR;
|
||||||
|
use error::Error;
|
||||||
|
use matcher::RegexCaptures;
|
||||||
|
|
||||||
|
/// A matcher that wraps a regex and strips a trailing `\r` from the end of
/// every match, in support of CRLF line terminators.
///
/// (The previous doc comment said "word match" semantics, which appears to
/// have been copied from the word matcher; see `CRLFMatcher::new` below.)
#[derive(Clone, Debug)]
pub struct CRLFMatcher {
    /// The regex.
    regex: Regex,
    /// A map from capture group name to capture group index.
    ///
    /// Indices are shifted down by one relative to the regex's own group
    /// indices (group 0 is excluded); see the `checked_sub(1)` in `new`.
    names: HashMap<String, usize>,
}
|
||||||
|
|
||||||
|
impl CRLFMatcher {
|
||||||
|
/// Create a new matcher from the given pattern that strips `\r` from the
|
||||||
|
/// end of every match.
|
||||||
|
///
|
||||||
|
/// This panics if the given expression doesn't need its CRLF stripped.
|
||||||
|
pub fn new(expr: &ConfiguredHIR) -> Result<CRLFMatcher, Error> {
|
||||||
|
assert!(expr.needs_crlf_stripped());
|
||||||
|
|
||||||
|
let regex = expr.regex()?;
|
||||||
|
let mut names = HashMap::new();
|
||||||
|
for (i, optional_name) in regex.capture_names().enumerate() {
|
||||||
|
if let Some(name) = optional_name {
|
||||||
|
names.insert(name.to_string(), i.checked_sub(1).unwrap());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(CRLFMatcher { regex, names })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher for CRLFMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, NoError> {
|
||||||
|
let m = match self.regex.find_at(haystack, at) {
|
||||||
|
None => return Ok(None),
|
||||||
|
Some(m) => Match::new(m.start(), m.end()),
|
||||||
|
};
|
||||||
|
Ok(Some(adjust_match(haystack, m)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
self.regex.captures_len().checked_sub(1).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
self.names.get(name).map(|i| *i)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
caps.strip_crlf(false);
|
||||||
|
let r = self.regex.captures_read_at(caps.locations(), haystack, at);
|
||||||
|
if !r.is_some() {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the end of our match includes a `\r`, then strip it from all
|
||||||
|
// capture groups ending at the same location.
|
||||||
|
let end = caps.locations().get(0).unwrap().1;
|
||||||
|
if end > 0 && haystack.get(end - 1) == Some(&b'\r') {
|
||||||
|
caps.strip_crlf(true);
|
||||||
|
}
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
// We specifically do not implement other methods like find_iter or
|
||||||
|
// captures_iter. Namely, the iter methods are guaranteed to be correct
|
||||||
|
// by virtue of implementing find_at and captures_at above.
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If the given match ends with a `\r`, then return a new match that ends
|
||||||
|
/// immediately before the `\r`.
|
||||||
|
pub fn adjust_match(haystack: &[u8], m: Match) -> Match {
|
||||||
|
if m.end() > 0 && haystack.get(m.end() - 1) == Some(&b'\r') {
|
||||||
|
m.with_end(m.end() - 1)
|
||||||
|
} else {
|
||||||
|
m
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`.
///
/// This does not preserve the exact semantics of the given expression,
/// however, it does have the useful property that anything that matched the
/// given expression will also match the returned expression. The difference is
/// that the returned expression can match possibly other things as well.
///
/// The principal reason why we do this is because the underlying regex engine
/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that
/// level, but we perform this kludge in the mean time.
///
/// Note that while the match preserving semantics are nice and neat, the
/// match position semantics are quite a bit messier. Namely, `$` only ever
/// matches the position between characters where as `\r??` can match a
/// character and change the offset. This is regrettable, but works out pretty
/// nicely in most cases, especially when a match is limited to a single line.
pub fn crlfify(expr: Hir) -> Hir {
    match expr.into_kind() {
        // Rewrite the multi-line end anchor `$` as `(?:\r??$)`. The `\r??`
        // is non-greedy so that matching the bare anchor is preferred.
        HirKind::Anchor(hir::Anchor::EndLine) => {
            let concat = Hir::concat(vec![
                Hir::repetition(hir::Repetition {
                    kind: hir::RepetitionKind::ZeroOrOne,
                    greedy: false,
                    hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))),
                }),
                Hir::anchor(hir::Anchor::EndLine),
            ]);
            Hir::group(hir::Group {
                kind: hir::GroupKind::NonCapturing,
                hir: Box::new(concat),
            })
        }
        // Every other kind is rebuilt as-is, recursing into any
        // sub-expressions it carries.
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(x) => Hir::literal(x),
        HirKind::Class(x) => Hir::class(x),
        HirKind::Anchor(x) => Hir::anchor(x),
        HirKind::WordBoundary(x) => Hir::word_boundary(x),
        HirKind::Repetition(mut x) => {
            x.hir = Box::new(crlfify(*x.hir));
            Hir::repetition(x)
        }
        HirKind::Group(mut x) => {
            x.hir = Box::new(crlfify(*x.hir));
            Hir::group(x)
        }
        HirKind::Concat(xs) => {
            Hir::concat(xs.into_iter().map(crlfify).collect())
        }
        HirKind::Alternation(xs) => {
            Hir::alternation(xs.into_iter().map(crlfify).collect())
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use regex_syntax::Parser;

    use super::crlfify;

    /// Parse the pattern, crlfify its HIR and render the result back to a
    /// concrete pattern string.
    fn roundtrip(pattern: &str) -> String {
        let parsed = Parser::new().parse(pattern).unwrap();
        crlfify(parsed).to_string()
    }

    #[test]
    fn various() {
        assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))");
        assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))");
        assert_eq!(
            roundtrip(r"(?m)(?:foo$|bar$)"),
            "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))"
        );
        assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a");

        // Not a multiline `$`, so no crlfifying occurs.
        assert_eq!(roundtrip(r"$"), "\\z");
        // It's a literal, derp.
        assert_eq!(roundtrip(r"\$"), "\\$");
    }
}
|
||||||
88
grep-regex/src/error.rs
Normal file
88
grep-regex/src/error.rs
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
use std::error;
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
|
use util;
|
||||||
|
|
||||||
|
/// An error that can occur in this crate.
///
/// Generally, this error corresponds to problems building a regular
/// expression, whether it's in parsing, compilation or a problem with
/// guaranteeing a configured optimization.
#[derive(Clone, Debug)]
pub struct Error {
    // The category of this error; inspect it via `Error::kind`.
    kind: ErrorKind,
}
|
||||||
|
|
||||||
|
impl Error {
|
||||||
|
pub(crate) fn new(kind: ErrorKind) -> Error {
|
||||||
|
Error { kind }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn regex<E: error::Error>(err: E) -> Error {
|
||||||
|
Error { kind: ErrorKind::Regex(err.to_string()) }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the kind of this error.
|
||||||
|
pub fn kind(&self) -> &ErrorKind {
|
||||||
|
&self.kind
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The kind of an error that can occur.
#[derive(Clone, Debug)]
pub enum ErrorKind {
    /// An error that occurred as a result of parsing a regular expression.
    /// This can be a syntax error or an error that results from attempting to
    /// compile a regular expression that is too big.
    ///
    /// The string here is the underlying error converted to a string.
    Regex(String),
    /// An error that occurs when building a regex that isn't permitted to
    /// match a line terminator. In general, building the regex will do its
    /// best to make matching a line terminator impossible (e.g., by removing
    /// `\n` from the `\s` character class), but if the regex contains a
    /// `\n` literal, then there is no reasonable choice that can be made and
    /// therefore an error is reported.
    ///
    /// The string is the literal sequence found in the regex that is not
    /// allowed.
    NotAllowed(String),
    /// This error occurs when a non-ASCII line terminator was provided.
    ///
    /// The invalid byte is included in this error.
    InvalidLineTerminator(u8),
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}
|
||||||
|
|
||||||
|
impl error::Error for Error {
    // Short static descriptions per kind; the full human readable message
    // (with payload details) lives in the `Display` impl below.
    fn description(&self) -> &str {
        match self.kind {
            ErrorKind::Regex(_) => "regex error",
            ErrorKind::NotAllowed(_) => "literal not allowed",
            ErrorKind::InvalidLineTerminator(_) => "invalid line terminator",
            // `__Nonexhaustive` is a never-constructed marker variant.
            ErrorKind::__Nonexhaustive => unreachable!(),
        }
    }
}
|
||||||
|
|
||||||
|
impl fmt::Display for Error {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
match self.kind {
|
||||||
|
ErrorKind::Regex(ref s) => write!(f, "{}", s),
|
||||||
|
ErrorKind::NotAllowed(ref lit) => {
|
||||||
|
write!(f, "the literal '{:?}' is not allowed in a regex", lit)
|
||||||
|
}
|
||||||
|
ErrorKind::InvalidLineTerminator(byte) => {
|
||||||
|
let x = util::show_bytes(&[byte]);
|
||||||
|
write!(f, "line terminators must be ASCII, but '{}' is not", x)
|
||||||
|
}
|
||||||
|
ErrorKind::__Nonexhaustive => unreachable!(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
27
grep-regex/src/lib.rs
Normal file
27
grep-regex/src/lib.rs
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
/*!
|
||||||
|
An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
|
extern crate grep_matcher;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate log;
|
||||||
|
extern crate regex;
|
||||||
|
extern crate regex_syntax;
|
||||||
|
extern crate thread_local;
|
||||||
|
extern crate utf8_ranges;
|
||||||
|
|
||||||
|
pub use error::{Error, ErrorKind};
|
||||||
|
pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder};
|
||||||
|
|
||||||
|
mod ast;
|
||||||
|
mod config;
|
||||||
|
mod crlf;
|
||||||
|
mod error;
|
||||||
|
mod literal;
|
||||||
|
mod matcher;
|
||||||
|
mod non_matching;
|
||||||
|
mod strip;
|
||||||
|
mod util;
|
||||||
|
mod word;
|
||||||
@@ -1,27 +1,37 @@
|
|||||||
/*!
|
/*
|
||||||
The literals module is responsible for extracting *inner* literals out of the
|
This module is responsible for extracting *inner* literals out of the AST of a
|
||||||
AST of a regular expression. Normally this is the job of the regex engine
|
regular expression. Normally this is the job of the regex engine itself, but
|
||||||
itself, but the regex engine doesn't look for inner literals. Since we're doing
|
the regex engine doesn't look for inner literals. Since we're doing line based
|
||||||
line based searching, we can use them, so we need to do it ourselves.
|
searching, we can use them, so we need to do it ourselves.
|
||||||
|
|
||||||
Note that this implementation is incredibly suspicious. We need something more
|
|
||||||
principled.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
use std::cmp;
|
use std::cmp;
|
||||||
|
|
||||||
use regex::bytes::RegexBuilder;
|
use regex_syntax::hir::{self, Hir, HirKind};
|
||||||
use syntax::hir::{self, Hir, HirKind};
|
use regex_syntax::hir::literal::{Literal, Literals};
|
||||||
use syntax::hir::literal::{Literal, Literals};
|
|
||||||
|
|
||||||
|
use util;
|
||||||
|
|
||||||
|
/// Represents prefix, suffix and inner "required" literals for a regular
|
||||||
|
/// expression.
|
||||||
|
///
|
||||||
|
/// Prefixes and suffixes are detected using regex-syntax. The inner required
|
||||||
|
/// literals are detected using something custom (but based on the code in
|
||||||
|
/// regex-syntax).
|
||||||
#[derive(Clone, Debug)]
|
#[derive(Clone, Debug)]
|
||||||
pub struct LiteralSets {
|
pub struct LiteralSets {
|
||||||
|
/// A set of prefix literals.
|
||||||
prefixes: Literals,
|
prefixes: Literals,
|
||||||
|
/// A set of suffix literals.
|
||||||
suffixes: Literals,
|
suffixes: Literals,
|
||||||
|
/// A set of literals such that at least one of them must appear in every
|
||||||
|
/// match. A literal in this set may be neither a prefix nor a suffix.
|
||||||
required: Literals,
|
required: Literals,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LiteralSets {
|
impl LiteralSets {
|
||||||
pub fn create(expr: &Hir) -> Self {
|
/// Create a set of literals from the given HIR expression.
|
||||||
|
pub fn new(expr: &Hir) -> LiteralSets {
|
||||||
let mut required = Literals::empty();
|
let mut required = Literals::empty();
|
||||||
union_required(expr, &mut required);
|
union_required(expr, &mut required);
|
||||||
LiteralSets {
|
LiteralSets {
|
||||||
@@ -31,10 +41,23 @@ impl LiteralSets {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn to_regex_builder(&self) -> Option<RegexBuilder> {
|
/// If it is deemed advantageuous to do so (via various suspicious
|
||||||
|
/// heuristics), this will return a single regular expression pattern that
|
||||||
|
/// matches a subset of the language matched by the regular expression that
|
||||||
|
/// generated these literal sets. The idea here is that the pattern
|
||||||
|
/// returned by this method is much cheaper to search for. i.e., It is
|
||||||
|
/// usually a single literal or an alternation of literals.
|
||||||
|
pub fn one_regex(&self) -> Option<String> {
|
||||||
|
// TODO: The logic in this function is basically inscrutable. It grew
|
||||||
|
// organically in the old grep 0.1 crate. Ideally, it would be
|
||||||
|
// re-worked. In fact, the entire inner literal extraction should be
|
||||||
|
// re-worked. Actually, most of regex-syntax's literal extraction
|
||||||
|
// should also be re-worked. Alas... only so much time in the day.
|
||||||
|
|
||||||
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
|
if self.prefixes.all_complete() && !self.prefixes.is_empty() {
|
||||||
debug!("literal prefixes detected: {:?}", self.prefixes);
|
debug!("literal prefixes detected: {:?}", self.prefixes);
|
||||||
// When this is true, the regex engine will do a literal scan.
|
// When this is true, the regex engine will do a literal scan,
|
||||||
|
// so we don't need to return anything.
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -67,16 +90,6 @@ impl LiteralSets {
|
|||||||
lit = req;
|
lit = req;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Special case: if we have any literals that are all whitespace,
|
|
||||||
// then this is probably a failing of the literal detection since
|
|
||||||
// whitespace is typically pretty common. In this case, don't bother
|
|
||||||
// with inner literal scanning at all and just defer to the regex.
|
|
||||||
let any_all_white = req_lits.iter()
|
|
||||||
.any(|lit| lit.iter().all(|&b| (b as char).is_whitespace()));
|
|
||||||
if any_all_white {
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Special case: if we detected an alternation of inner required
|
// Special case: if we detected an alternation of inner required
|
||||||
// literals and its longest literal is bigger than the longest
|
// literals and its longest literal is bigger than the longest
|
||||||
// prefix/suffix, then choose the alternation. In practice, this
|
// prefix/suffix, then choose the alternation. In practice, this
|
||||||
@@ -85,18 +98,17 @@ impl LiteralSets {
|
|||||||
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
|
let any_empty = req_lits.iter().any(|lit| lit.is_empty());
|
||||||
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
|
if req.len() > lit.len() && req_lits.len() > 1 && !any_empty {
|
||||||
debug!("required literals found: {:?}", req_lits);
|
debug!("required literals found: {:?}", req_lits);
|
||||||
let alts: Vec<String> =
|
let alts: Vec<String> = req_lits
|
||||||
req_lits.into_iter().map(|x| bytes_to_regex(x)).collect();
|
.into_iter()
|
||||||
let mut builder = RegexBuilder::new(&alts.join("|"));
|
.map(|x| util::bytes_to_regex(x))
|
||||||
builder.unicode(false);
|
.collect();
|
||||||
Some(builder)
|
// We're matching raw bytes, so disable Unicode mode.
|
||||||
|
Some(format!("(?-u:{})", alts.join("|")))
|
||||||
} else if lit.is_empty() {
|
} else if lit.is_empty() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
debug!("required literal found: {:?}", show(lit));
|
debug!("required literal found: {:?}", util::show_bytes(lit));
|
||||||
let mut builder = RegexBuilder::new(&bytes_to_regex(&lit));
|
Some(format!("(?-u:{})", util::bytes_to_regex(&lit)))
|
||||||
builder.unicode(false);
|
|
||||||
Some(builder)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -154,10 +166,10 @@ fn union_required(expr: &Hir, lits: &mut Literals) {
|
|||||||
lits.cut();
|
lits.cut();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if lits2.contains_empty() {
|
if lits2.contains_empty() || !is_simple(&e) {
|
||||||
lits.cut();
|
lits.cut();
|
||||||
}
|
}
|
||||||
if !lits.cross_product(&lits2) {
|
if !lits.cross_product(&lits2) || !lits2.any_complete() {
|
||||||
// If this expression couldn't yield any literal that
|
// If this expression couldn't yield any literal that
|
||||||
// could be extended, then we need to quit. Since we're
|
// could be extended, then we need to quit. Since we're
|
||||||
// short-circuiting, we also need to freeze every member.
|
// short-circuiting, we also need to freeze every member.
|
||||||
@@ -238,6 +250,20 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn is_simple(expr: &Hir) -> bool {
|
||||||
|
match *expr.kind() {
|
||||||
|
HirKind::Empty
|
||||||
|
| HirKind::Literal(_)
|
||||||
|
| HirKind::Class(_)
|
||||||
|
| HirKind::Repetition(_)
|
||||||
|
| HirKind::Concat(_)
|
||||||
|
| HirKind::Alternation(_) => true,
|
||||||
|
HirKind::Anchor(_)
|
||||||
|
| HirKind::WordBoundary(_)
|
||||||
|
| HirKind::Group(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Return the number of characters in the given class.
|
/// Return the number of characters in the given class.
|
||||||
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
|
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
|
||||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||||
@@ -248,27 +274,53 @@ fn count_byte_class(cls: &hir::ClassBytes) -> u32 {
|
|||||||
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Converts an arbitrary sequence of bytes to a literal suitable for building
|
#[cfg(test)]
|
||||||
/// a regular expression.
|
mod tests {
|
||||||
fn bytes_to_regex(bs: &[u8]) -> String {
|
use regex_syntax::Parser;
|
||||||
let mut s = String::with_capacity(bs.len());
|
use super::LiteralSets;
|
||||||
for &b in bs {
|
|
||||||
s.push_str(&format!("\\x{:02x}", b));
|
|
||||||
}
|
|
||||||
s
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Converts arbitrary bytes to a nice string.
|
fn sets(pattern: &str) -> LiteralSets {
|
||||||
fn show(bs: &[u8]) -> String {
|
let hir = Parser::new().parse(pattern).unwrap();
|
||||||
// Why aren't we using this to feed to the regex? Doesn't really matter
|
LiteralSets::new(&hir)
|
||||||
// I guess. ---AG
|
}
|
||||||
use std::ascii::escape_default;
|
|
||||||
use std::str;
|
fn one_regex(pattern: &str) -> Option<String> {
|
||||||
|
sets(pattern).one_regex()
|
||||||
let mut nice = String::new();
|
}
|
||||||
for &b in bs {
|
|
||||||
let part: Vec<u8> = escape_default(b).collect();
|
// Put a pattern into the same format as the one returned by `one_regex`.
|
||||||
nice.push_str(str::from_utf8(&part).unwrap());
|
fn pat(pattern: &str) -> Option<String> {
|
||||||
|
Some(format!("(?-u:{})", pattern))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn various() {
|
||||||
|
// Obviously no literals.
|
||||||
|
assert!(one_regex(r"\w").is_none());
|
||||||
|
assert!(one_regex(r"\pL").is_none());
|
||||||
|
|
||||||
|
// Tantalizingly close.
|
||||||
|
assert!(one_regex(r"\w|foo").is_none());
|
||||||
|
|
||||||
|
// There's a literal, but it's better if the regex engine handles it
|
||||||
|
// internally.
|
||||||
|
assert!(one_regex(r"abc").is_none());
|
||||||
|
|
||||||
|
// Core use cases.
|
||||||
|
assert_eq!(one_regex(r"\wabc\w"), pat("abc"));
|
||||||
|
assert_eq!(one_regex(r"abc\w"), pat("abc"));
|
||||||
|
|
||||||
|
// TODO: Make these pass. We're missing some potentially big wins
|
||||||
|
// without these.
|
||||||
|
// assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
|
||||||
|
// assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn regression_1064() {
|
||||||
|
// Regression from:
|
||||||
|
// https://github.com/BurntSushi/ripgrep/issues/1064
|
||||||
|
// assert_eq!(one_regex(r"a.*c"), pat("a"));
|
||||||
|
assert_eq!(one_regex(r"a(.*c)"), pat("a"));
|
||||||
}
|
}
|
||||||
nice
|
|
||||||
}
|
}
|
||||||
922
grep-regex/src/matcher.rs
Normal file
922
grep-regex/src/matcher.rs
Normal file
@@ -0,0 +1,922 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use grep_matcher::{
|
||||||
|
Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, ByteSet,
|
||||||
|
};
|
||||||
|
use regex::bytes::{CaptureLocations, Regex};
|
||||||
|
|
||||||
|
use config::{Config, ConfiguredHIR};
|
||||||
|
use crlf::CRLFMatcher;
|
||||||
|
use error::Error;
|
||||||
|
use word::WordMatcher;
|
||||||
|
|
||||||
|
/// A builder for constructing a `Matcher` using regular expressions.
///
/// This builder re-exports many of the same options found on the regex crate's
/// builder, in addition to a few other options such as smart case, word
/// matching and the ability to set a line terminator which may enable certain
/// types of optimizations.
///
/// The syntax supported is documented as part of the regex crate:
/// https://docs.rs/regex/*/regex/#syntax
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
    // All builder options are accumulated here and consumed by `build`.
    config: Config,
}
|
||||||
|
|
||||||
|
impl Default for RegexMatcherBuilder {
    /// Equivalent to `RegexMatcherBuilder::new`.
    fn default() -> RegexMatcherBuilder {
        Self::new()
    }
}
|
||||||
|
|
||||||
|
impl RegexMatcherBuilder {
    /// Create a new builder for configuring a regex matcher.
    pub fn new() -> RegexMatcherBuilder {
        RegexMatcherBuilder {
            config: Config::default(),
        }
    }

    /// Build a new matcher using the current configuration for the provided
    /// pattern.
    ///
    /// The syntax supported is documented as part of the regex crate:
    /// https://docs.rs/regex/*/regex/#syntax
    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
        // Translate the pattern into a configured HIR; this is where
        // pattern-level options (case, CRLF, line terminator) get applied.
        let chir = self.config.hir(pattern)?;
        // A cheaper prefilter regex (literal or literal alternation) that
        // may report false positives but never false negatives.
        let fast_line_regex = chir.fast_line_regex()?;
        let non_matching_bytes = chir.non_matching_bytes();
        if let Some(ref re) = fast_line_regex {
            trace!("extracted fast line regex: {:?}", re);
        }
        Ok(RegexMatcher {
            config: self.config.clone(),
            matcher: RegexMatcherImpl::new(&chir)?,
            fast_line_regex: fast_line_regex,
            non_matching_bytes: non_matching_bytes,
        })
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the pattern will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_insensitive = yes;
        self
    }

    /// Whether to enable "smart case" or not.
    ///
    /// When smart case is enabled, the builder will automatically enable
    /// case insensitive matching based on how the pattern is written. Namely,
    /// case insensitive mode is enabled when both of the following things
    /// are true:
    ///
    /// 1. The pattern contains at least one literal character. For example,
    ///    `a\w` contains a literal (`a`) but `\w` does not.
    /// 2. Of the literals in the pattern, none of them are considered to be
    ///    uppercase according to Unicode. For example, `foo\pL` has no
    ///    uppercase literals but `Foo\pL` does.
    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_smart = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, where in `.` matches
    /// anything when `s` is set and matches anything except for new line when
    /// it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" when Unicode is disabled and
    /// means "any valid UTF-8 encoding of any Unicode scalar value" when
    /// Unicode is enabled.
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w` only
    /// match ASCII word characters instead of all Unicode word characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
        self.config.size_limit = bytes;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a global
    /// limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(
        &mut self,
        bytes: usize,
    ) -> &mut RegexMatcherBuilder {
        self.config.dfa_size_limit = bytes;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
        self.config.nest_limit = limit;
        self
    }

    /// Set an ASCII line terminator for the matcher.
    ///
    /// The purpose of setting a line terminator is to enable a certain class
    /// of optimizations that can make line oriented searching faster. Namely,
    /// when a line terminator is enabled, then the builder will guarantee that
    /// the resulting matcher will never be capable of producing a match that
    /// contains the line terminator. Because of this guarantee, users of the
    /// resulting matcher do not need to slowly execute a search line by line
    /// for line oriented search.
    ///
    /// If the aforementioned guarantee about not matching a line terminator
    /// cannot be made because of how the pattern was written, then the builder
    /// will return an error when attempting to construct the matcher. For
    /// example, the pattern `a\sb` will be transformed such that it can never
    /// match `a\nb` (when `\n` is the line terminator), but the pattern `a\nb`
    /// will result in an error since the `\n` cannot be easily removed without
    /// changing the fundamental intent of the pattern.
    ///
    /// If the given line terminator isn't an ASCII byte (`<=127`), then the
    /// builder will return an error when constructing the matcher.
    pub fn line_terminator(
        &mut self,
        line_term: Option<u8>,
    ) -> &mut RegexMatcherBuilder {
        self.config.line_terminator = line_term.map(LineTerminator::byte);
        self
    }

    /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in
    /// regex patterns.
    ///
    /// This method sets two distinct settings:
    ///
    /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely,
    ///    this prevents the matcher from ever producing a match that contains
    ///    a `\r` or `\n`.
    /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`.
    ///    This works around the fact that the regex engine does not support
    ///    matching CRLF as a line terminator when using `$`.
    ///
    /// In particular, because of (2), the matches produced by the matcher may
    /// be slightly different than what one would expect given the pattern.
    /// This is the trade off made: in many cases, `$` will "just work" in the
    /// presence of `\r\n` line terminators, but matches may require some
    /// trimming to faithfully represent the intended match.
    ///
    /// Note that if you do not wish to set the line terminator but would still
    /// like `$` to match `\r\n` line terminators, then it is valid to call
    /// `crlf(true)` followed by `line_terminator(None)`. Ordering is
    /// important, since `crlf` and `line_terminator` override each other.
    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        // `crlf` and `line_terminator` intentionally overwrite each other's
        // line terminator setting; last caller wins.
        if yes {
            self.config.line_terminator = Some(LineTerminator::crlf());
        } else {
            self.config.line_terminator = None;
        }
        self.config.crlf = yes;
        self
    }

    /// Require that all matches occur on word boundaries.
    ///
    /// Enabling this option is subtly different than putting `\b` assertions
    /// on both sides of your pattern. In particular, a `\b` assertion requires
    /// that one side of it match a word character while the other match a
    /// non-word character. This option, in contrast, merely requires that
    /// one side match a non-word character.
    ///
    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
    /// word character. However, `-2` with this `word` option enabled will
    /// match the `-2` in `foo -2 bar`.
    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.word = yes;
        self
    }
}
|
||||||
|
|
||||||
|
/// An implementation of the `Matcher` trait using Rust's standard regex
/// library.
#[derive(Clone, Debug)]
pub struct RegexMatcher {
    /// The configuration specified by the caller.
    config: Config,
    /// The underlying matcher implementation.
    matcher: RegexMatcherImpl,
    /// A regex that never reports false negatives but may report false
    /// positives that is believed to be capable of being matched more quickly
    /// than `regex`. Typically, this is a single literal or an alternation
    /// of literals.
    ///
    /// When present, this is used by `find_candidate_line` to cheaply report
    /// candidate matching lines.
    fast_line_regex: Option<Regex>,
    /// A set of bytes that will never appear in a match.
    ///
    /// Exposed to callers through the `non_matching_bytes` method of the
    /// `Matcher` trait.
    non_matching_bytes: ByteSet,
}
|
||||||
|
|
||||||
|
impl RegexMatcher {
|
||||||
|
/// Create a new matcher from the given pattern using the default
|
||||||
|
/// configuration.
|
||||||
|
pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
|
||||||
|
RegexMatcherBuilder::new().build(pattern)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new matcher from the given pattern using the default
|
||||||
|
/// configuration, but matches lines terminated by `\n`.
|
||||||
|
///
|
||||||
|
/// This is meant to be a convenience constructor for using a
|
||||||
|
/// `RegexMatcherBuilder` and setting its
|
||||||
|
/// [`line_terminator`](struct.RegexMatcherBuilder.html#method.line_terminator)
|
||||||
|
/// to `\n`. The purpose of using this constructor is to permit special
|
||||||
|
/// optimizations that help speed up line oriented search. These types of
|
||||||
|
/// optimizations are only appropriate when matches span no more than one
|
||||||
|
/// line. For this reason, this constructor will return an error if the
|
||||||
|
/// given pattern contains a literal `\n`. Other uses of `\n` (such as in
|
||||||
|
/// `\s`) are removed transparently.
|
||||||
|
pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
|
||||||
|
RegexMatcherBuilder::new()
|
||||||
|
.line_terminator(Some(b'\n'))
|
||||||
|
.build(pattern)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An encapsulation of the type of matcher we use in `RegexMatcher`.
#[derive(Clone, Debug)]
enum RegexMatcherImpl {
    /// The standard matcher used for all regular expressions.
    Standard(StandardMatcher),
    /// A matcher that strips `\r` from the end of matches.
    ///
    /// This is only used when the CRLF hack is enabled and the regex is line
    /// anchored at the end.
    CRLF(CRLFMatcher),
    /// A matcher that only matches at word boundaries. This transforms the
    /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`.
    /// Because of this, the WordMatcher provides its own implementation of
    /// `Matcher` to encapsulate its use of capture groups to make them
    /// invisible to the caller.
    Word(WordMatcher),
}
|
||||||
|
|
||||||
|
impl RegexMatcherImpl {
|
||||||
|
/// Based on the configuration, create a new implementation of the
|
||||||
|
/// `Matcher` trait.
|
||||||
|
fn new(expr: &ConfiguredHIR) -> Result<RegexMatcherImpl, Error> {
|
||||||
|
if expr.config().word {
|
||||||
|
Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?))
|
||||||
|
} else if expr.needs_crlf_stripped() {
|
||||||
|
Ok(RegexMatcherImpl::CRLF(CRLFMatcher::new(expr)?))
|
||||||
|
} else {
|
||||||
|
Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This implementation just dispatches on the internal matcher impl except
|
||||||
|
// for the line terminator optimization, which is possibly executed via
|
||||||
|
// `fast_line_regex`.
|
||||||
|
impl Matcher for RegexMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.find_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.find_at(haystack, at),
|
||||||
|
Word(ref m) => m.find_at(haystack, at),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.new_captures(),
|
||||||
|
CRLF(ref m) => m.new_captures(),
|
||||||
|
Word(ref m) => m.new_captures(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.capture_count(),
|
||||||
|
CRLF(ref m) => m.capture_count(),
|
||||||
|
Word(ref m) => m.capture_count(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.capture_index(name),
|
||||||
|
CRLF(ref m) => m.capture_index(name),
|
||||||
|
Word(ref m) => m.capture_index(name),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find(&self, haystack: &[u8]) -> Result<Option<Match>, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.find(haystack),
|
||||||
|
CRLF(ref m) => m.find(haystack),
|
||||||
|
Word(ref m) => m.find(haystack),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_iter<F>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
matched: F,
|
||||||
|
) -> Result<(), NoError>
|
||||||
|
where F: FnMut(Match) -> bool
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.find_iter(haystack, matched),
|
||||||
|
CRLF(ref m) => m.find_iter(haystack, matched),
|
||||||
|
Word(ref m) => m.find_iter(haystack, matched),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_find_iter<F, E>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
matched: F,
|
||||||
|
) -> Result<Result<(), E>, NoError>
|
||||||
|
where F: FnMut(Match) -> Result<bool, E>
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.try_find_iter(haystack, matched),
|
||||||
|
CRLF(ref m) => m.try_find_iter(haystack, matched),
|
||||||
|
Word(ref m) => m.try_find_iter(haystack, matched),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.captures(haystack, caps),
|
||||||
|
CRLF(ref m) => m.captures(haystack, caps),
|
||||||
|
Word(ref m) => m.captures(haystack, caps),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_iter<F>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
matched: F,
|
||||||
|
) -> Result<(), NoError>
|
||||||
|
where F: FnMut(&RegexCaptures) -> bool
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
|
CRLF(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
|
Word(ref m) => m.captures_iter(haystack, caps, matched),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_captures_iter<F, E>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
matched: F,
|
||||||
|
) -> Result<Result<(), E>, NoError>
|
||||||
|
where F: FnMut(&RegexCaptures) -> Result<bool, E>
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
|
CRLF(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
|
Word(ref m) => m.try_captures_iter(haystack, caps, matched),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.captures_at(haystack, at, caps),
|
||||||
|
CRLF(ref m) => m.captures_at(haystack, at, caps),
|
||||||
|
Word(ref m) => m.captures_at(haystack, at, caps),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn replace<F>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
dst: &mut Vec<u8>,
|
||||||
|
append: F,
|
||||||
|
) -> Result<(), NoError>
|
||||||
|
where F: FnMut(Match, &mut Vec<u8>) -> bool
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.replace(haystack, dst, append),
|
||||||
|
CRLF(ref m) => m.replace(haystack, dst, append),
|
||||||
|
Word(ref m) => m.replace(haystack, dst, append),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn replace_with_captures<F>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
dst: &mut Vec<u8>,
|
||||||
|
append: F,
|
||||||
|
) -> Result<(), NoError>
|
||||||
|
where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
|
||||||
|
{
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => {
|
||||||
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
|
}
|
||||||
|
CRLF(ref m) => {
|
||||||
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
|
}
|
||||||
|
Word(ref m) => {
|
||||||
|
m.replace_with_captures(haystack, caps, dst, append)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_match(&self, haystack: &[u8]) -> Result<bool, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.is_match(haystack),
|
||||||
|
CRLF(ref m) => m.is_match(haystack),
|
||||||
|
Word(ref m) => m.is_match(haystack),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_match_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.is_match_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.is_match_at(haystack, at),
|
||||||
|
Word(ref m) => m.is_match_at(haystack, at),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shortest_match(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
) -> Result<Option<usize>, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.shortest_match(haystack),
|
||||||
|
CRLF(ref m) => m.shortest_match(haystack),
|
||||||
|
Word(ref m) => m.shortest_match(haystack),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shortest_match_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<usize>, NoError> {
|
||||||
|
use self::RegexMatcherImpl::*;
|
||||||
|
match self.matcher {
|
||||||
|
Standard(ref m) => m.shortest_match_at(haystack, at),
|
||||||
|
CRLF(ref m) => m.shortest_match_at(haystack, at),
|
||||||
|
Word(ref m) => m.shortest_match_at(haystack, at),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn non_matching_bytes(&self) -> Option<&ByteSet> {
|
||||||
|
Some(&self.non_matching_bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn line_terminator(&self) -> Option<LineTerminator> {
|
||||||
|
self.config.line_terminator
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_candidate_line(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
) -> Result<Option<LineMatchKind>, NoError> {
|
||||||
|
Ok(match self.fast_line_regex {
|
||||||
|
Some(ref regex) => {
|
||||||
|
regex.shortest_match(haystack).map(LineMatchKind::Candidate)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The implementation of the standard regex matcher.
#[derive(Clone, Debug)]
struct StandardMatcher {
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// A map from capture group name to its corresponding index.
    ///
    /// Populated once at construction time and used to answer
    /// `capture_index` queries.
    names: HashMap<String, usize>,
}
|
||||||
|
|
||||||
|
impl StandardMatcher {
|
||||||
|
fn new(expr: &ConfiguredHIR) -> Result<StandardMatcher, Error> {
|
||||||
|
let regex = expr.regex()?;
|
||||||
|
let mut names = HashMap::new();
|
||||||
|
for (i, optional_name) in regex.capture_names().enumerate() {
|
||||||
|
if let Some(name) = optional_name {
|
||||||
|
names.insert(name.to_string(), i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(StandardMatcher { regex, names })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher for StandardMatcher {
|
||||||
|
type Captures = RegexCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, NoError> {
|
||||||
|
Ok(self.regex
|
||||||
|
.find_at(haystack, at)
|
||||||
|
.map(|m| Match::new(m.start(), m.end())))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<RegexCaptures, NoError> {
|
||||||
|
Ok(RegexCaptures::new(self.regex.capture_locations()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_count(&self) -> usize {
|
||||||
|
self.regex.captures_len()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn capture_index(&self, name: &str) -> Option<usize> {
|
||||||
|
self.names.get(name).map(|i| *i)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_find_iter<F, E>(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
mut matched: F,
|
||||||
|
) -> Result<Result<(), E>, NoError>
|
||||||
|
where F: FnMut(Match) -> Result<bool, E>
|
||||||
|
{
|
||||||
|
for m in self.regex.find_iter(haystack) {
|
||||||
|
match matched(Match::new(m.start(), m.end())) {
|
||||||
|
Ok(true) => continue,
|
||||||
|
Ok(false) => return Ok(Ok(())),
|
||||||
|
Err(err) => return Ok(Err(err)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(Ok(()))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn captures_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
caps: &mut RegexCaptures,
|
||||||
|
) -> Result<bool, NoError> {
|
||||||
|
Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shortest_match_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<usize>, NoError> {
|
||||||
|
Ok(self.regex.shortest_match_at(haystack, at))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents the match offsets of each capturing group in a match.
///
/// The first, or `0`th capture group, always corresponds to the entire match
/// and is guaranteed to be present when a match occurs. The next capture
/// group, at index `1`, corresponds to the first capturing group in the regex,
/// ordered by the position at which the left opening parenthesis occurs.
///
/// Note that not all capturing groups are guaranteed to be present in a match.
/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo`
/// or `bar` will ever be set in any given match.
///
/// In order to access a capture group by name, you'll need to first find the
/// index of the group using the corresponding matcher's `capture_index`
/// method, and then use that index with `RegexCaptures::get`.
#[derive(Clone, Debug)]
pub struct RegexCaptures {
    /// Where the locations are stored.
    locs: CaptureLocations,
    /// These captures behave as if the capturing groups begin at the given
    /// offset. When set to `0`, this has no effect and capture groups are
    /// indexed like normal.
    ///
    /// This is useful when building matchers that wrap arbitrary regular
    /// expressions. For example, `WordMatcher` takes an existing regex `re`
    /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex
    /// has been wrapped from the caller. In order to do this, the matcher
    /// and the capturing groups must behave as if `(re)` is the `0`th capture
    /// group.
    offset: usize,
    /// When enabled, the end of a match has `\r` stripped from it, if one
    /// exists.
    strip_crlf: bool,
}
|
||||||
|
|
||||||
|
impl Captures for RegexCaptures {
    /// Return the number of capture groups visible to the caller, i.e.,
    /// the total number of stored locations minus the hidden offset.
    fn len(&self) -> usize {
        self.locs.len().checked_sub(self.offset).unwrap()
    }

    /// Return the offsets of the `i`th capture group, if it participated in
    /// the match. Offsetting and CRLF stripping are applied transparently.
    fn get(&self, i: usize) -> Option<Match> {
        if !self.strip_crlf {
            // Fast path: just translate the caller's index past the hidden
            // groups, if any.
            let actual = i.checked_add(self.offset).unwrap();
            return self.locs.pos(actual).map(|(s, e)| Match::new(s, e));
        }

        // currently don't support capture offsetting with CRLF stripping
        assert_eq!(self.offset, 0);
        let m = match self.locs.pos(i).map(|(s, e)| Match::new(s, e)) {
            None => return None,
            Some(m) => m,
        };
        // If the end position of this match corresponds to the end position
        // of the overall match, then we apply our CRLF stripping. Otherwise,
        // we cannot assume stripping is correct.
        //
        // NOTE(review): this trims one byte unconditionally; it appears to
        // rely on the caller only setting `strip_crlf` when the match is
        // known to end with `\r` — confirm against `CRLFMatcher`.
        if i == 0 || m.end() == self.locs.pos(0).unwrap().1 {
            Some(m.with_end(m.end() - 1))
        } else {
            Some(m)
        }
    }
}
|
||||||
|
|
||||||
|
impl RegexCaptures {
|
||||||
|
pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures {
|
||||||
|
RegexCaptures::with_offset(locs, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn with_offset(
|
||||||
|
locs: CaptureLocations,
|
||||||
|
offset: usize,
|
||||||
|
) -> RegexCaptures {
|
||||||
|
RegexCaptures { locs, offset, strip_crlf: false }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn locations(&mut self) -> &mut CaptureLocations {
|
||||||
|
&mut self.locs
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn strip_crlf(&mut self, yes: bool) {
|
||||||
|
self.strip_crlf = yes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use grep_matcher::{LineMatchKind, Matcher};
    use super::*;

    // Test that enabling word matches does the right thing and demonstrate
    // the difference between it and surrounding the regex in `\b`.
    #[test]
    fn word() {
        let matcher = RegexMatcherBuilder::new()
            .word(true)
            .build(r"-2")
            .unwrap();
        assert!(matcher.is_match(b"abc -2 foo").unwrap());

        // `\b-2\b` fails here because `-` is not a word character, so the
        // left `\b` assertion cannot be satisfied.
        let matcher = RegexMatcherBuilder::new()
            .word(false)
            .build(r"\b-2\b")
            .unwrap();
        assert!(!matcher.is_match(b"abc -2 foo").unwrap());
    }

    // Test that enabling a line terminator prevents it from matching through
    // said line terminator.
    #[test]
    fn line_terminator() {
        // This works, because there's no line terminator specified.
        let matcher = RegexMatcherBuilder::new()
            .build(r"abc\sxyz")
            .unwrap();
        assert!(matcher.is_match(b"abc\nxyz").unwrap());

        // This doesn't.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"abc\sxyz")
            .unwrap();
        assert!(!matcher.is_match(b"abc\nxyz").unwrap());
    }

    // Ensure that the builder returns an error if a line terminator is set
    // and the regex could not be modified to remove a line terminator.
    #[test]
    fn line_terminator_error() {
        assert!(RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"a\nz")
            .is_err())
    }

    // Test that enabling CRLF permits `$` to match at the end of a line.
    #[test]
    fn line_terminator_crlf() {
        // Test normal use of `$` with a `\n` line terminator.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\n").unwrap());

        // Test that `$` doesn't match at `\r\n` boundary normally.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(!matcher.is_match(b"abc\r\n").unwrap());

        // Now check the CRLF handling.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .crlf(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\r\n").unwrap());
    }

    // Test that smart case works.
    #[test]
    fn case_smart() {
        // An all-lowercase pattern is matched case insensitively.
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"abc")
            .unwrap();
        assert!(matcher.is_match(b"ABC").unwrap());

        // A pattern containing an uppercase letter is matched sensitively.
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"aBc")
            .unwrap();
        assert!(!matcher.is_match(b"ABC").unwrap());
    }

    // Test that finding candidate lines works as expected.
    #[test]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Confirmed(_) => true,
                _ => false,
            }
        }
        fn is_candidate(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Candidate(_) => true,
                _ => false,
            }
        }

        // With no line terminator set, we can't employ any optimizations,
        // so we get a confirmed match.
        let matcher = RegexMatcherBuilder::new()
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_confirmed(m));

        // With a line terminator and a regex specially crafted to have an
        // easy-to-detect inner literal, we can apply an optimization that
        // quickly finds candidate matches.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_candidate(m));
    }
}
|
||||||
128
grep-regex/src/non_matching.rs
Normal file
128
grep-regex/src/non_matching.rs
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
use grep_matcher::ByteSet;
|
||||||
|
use regex_syntax::hir::{self, Hir, HirKind};
|
||||||
|
use utf8_ranges::Utf8Sequences;
|
||||||
|
|
||||||
|
/// Return a confirmed set of non-matching bytes from the given expression.
|
||||||
|
pub fn non_matching_bytes(expr: &Hir) -> ByteSet {
|
||||||
|
let mut set = ByteSet::full();
|
||||||
|
remove_matching_bytes(expr, &mut set);
|
||||||
|
set
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove any bytes from the given set that can occur in a matched produced by
|
||||||
|
/// the given expression.
|
||||||
|
fn remove_matching_bytes(
|
||||||
|
expr: &Hir,
|
||||||
|
set: &mut ByteSet,
|
||||||
|
) {
|
||||||
|
match *expr.kind() {
|
||||||
|
HirKind::Empty
|
||||||
|
| HirKind::Anchor(_)
|
||||||
|
| HirKind::WordBoundary(_) => {}
|
||||||
|
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
||||||
|
for &b in c.encode_utf8(&mut [0; 4]).as_bytes() {
|
||||||
|
set.remove(b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HirKind::Literal(hir::Literal::Byte(b)) => {
|
||||||
|
set.remove(b);
|
||||||
|
}
|
||||||
|
HirKind::Class(hir::Class::Unicode(ref cls)) => {
|
||||||
|
for range in cls.iter() {
|
||||||
|
// This is presumably faster than encoding every codepoint
|
||||||
|
// to UTF-8 and then removing those bytes from the set.
|
||||||
|
for seq in Utf8Sequences::new(range.start(), range.end()) {
|
||||||
|
for byte_range in seq.as_slice() {
|
||||||
|
set.remove_all(byte_range.start, byte_range.end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HirKind::Class(hir::Class::Bytes(ref cls)) => {
|
||||||
|
for range in cls.iter() {
|
||||||
|
set.remove_all(range.start(), range.end());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HirKind::Repetition(ref x) => {
|
||||||
|
remove_matching_bytes(&x.hir, set);
|
||||||
|
}
|
||||||
|
HirKind::Group(ref x) => {
|
||||||
|
remove_matching_bytes(&x.hir, set);
|
||||||
|
}
|
||||||
|
HirKind::Concat(ref xs) => {
|
||||||
|
for x in xs {
|
||||||
|
remove_matching_bytes(x, set);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HirKind::Alternation(ref xs) => {
|
||||||
|
for x in xs {
|
||||||
|
remove_matching_bytes(x, set);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use grep_matcher::ByteSet;
    use regex_syntax::ParserBuilder;

    use super::non_matching_bytes;

    // Parse the pattern (permitting non-UTF-8 matching) and compute its
    // non-matching byte set.
    fn extract(pattern: &str) -> ByteSet {
        let expr = ParserBuilder::new()
            .allow_invalid_utf8(true)
            .build()
            .parse(pattern)
            .unwrap();
        non_matching_bytes(&expr)
    }

    // Expand a ByteSet into the sorted list of bytes it contains.
    fn sparse(set: &ByteSet) -> Vec<u8> {
        let mut sparse_set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if set.contains(b) {
                sparse_set.push(b);
            }
        }
        sparse_set
    }

    // All 256 bytes except the given ones, in sorted order.
    fn sparse_except(except: &[u8]) -> Vec<u8> {
        let mut except_set = vec![false; 256];
        for &b in except {
            except_set[b as usize] = true;
        }

        let mut set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if !except_set[b as usize] {
                set.push(b);
            }
        }
        set
    }

    #[test]
    fn dot() {
        // The bytes 192, 193 and 245-255 never appear in valid UTF-8, so a
        // Unicode `.` can never match them; without `(?s)` it also never
        // matches `\n`.
        assert_eq!(sparse(&extract(".")), vec![
            b'\n',
            192, 193, 245, 246, 247, 248, 249,
            250, 251, 252, 253, 254, 255,
        ]);
        assert_eq!(sparse(&extract("(?s).")), vec![
            192, 193, 245, 246, 247, 248, 249,
            250, 251, 252, 253, 254, 255,
        ]);
        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
    }

    #[test]
    fn literal() {
        assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
        assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
        // With Unicode mode on, `\xFF` is the codepoint U+00FF, whose UTF-8
        // encoding is the two bytes 0xC3 0xBF.
        assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
        assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
    }
}
|
||||||
154
grep-regex/src/strip.rs
Normal file
154
grep-regex/src/strip.rs
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
use grep_matcher::LineTerminator;
|
||||||
|
use regex_syntax::hir::{self, Hir, HirKind};
|
||||||
|
|
||||||
|
use error::{Error, ErrorKind};
|
||||||
|
|
||||||
|
/// Return an HIR that is guaranteed to never match the given line terminator,
|
||||||
|
/// if possible.
|
||||||
|
///
|
||||||
|
/// If the transformation isn't possible, then an error is returned.
|
||||||
|
///
|
||||||
|
/// In general, if a literal line terminator occurs anywhere in the HIR, then
|
||||||
|
/// this will return an error. However, if the line terminator occurs within
|
||||||
|
/// a character class with at least one other character (that isn't also a line
|
||||||
|
/// terminator), then the line terminator is simply stripped from that class.
|
||||||
|
///
|
||||||
|
/// If the given line terminator is not ASCII, then this function returns an
|
||||||
|
/// error.
|
||||||
|
pub fn strip_from_match(
|
||||||
|
expr: Hir,
|
||||||
|
line_term: LineTerminator,
|
||||||
|
) -> Result<Hir, Error> {
|
||||||
|
if line_term.is_crlf() {
|
||||||
|
let expr1 = strip_from_match_ascii(expr, b'\r')?;
|
||||||
|
strip_from_match_ascii(expr1, b'\n')
|
||||||
|
} else {
|
||||||
|
let b = line_term.as_byte();
|
||||||
|
if b > 0x7F {
|
||||||
|
return Err(Error::new(ErrorKind::InvalidLineTerminator(b)));
|
||||||
|
}
|
||||||
|
strip_from_match_ascii(expr, b)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The implementation of strip_from_match. The given byte must be ASCII. This
/// function panics otherwise.
fn strip_from_match_ascii(
    expr: Hir,
    byte: u8,
) -> Result<Hir, Error> {
    assert!(byte <= 0x7F);
    let chr = byte as char;
    assert_eq!(chr.len_utf8(), 1);

    // Error produced for any sub-expression that can only match the
    // terminator and therefore cannot be stripped.
    let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string())));

    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            // A literal equal to the terminator cannot avoid matching it.
            if c == chr {
                return invalid();
            }
            Hir::literal(hir::Literal::Unicode(c))
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            if b as char == chr {
                return invalid();
            }
            Hir::literal(hir::Literal::Byte(b))
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            // Remove the terminator from the class; if the class becomes
            // empty, it could only ever have matched the terminator.
            let remove = hir::ClassUnicode::new(Some(
                hir::ClassUnicodeRange::new(chr, chr),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Unicode(cls))
        }
        HirKind::Class(hir::Class::Bytes(mut cls)) => {
            let remove = hir::ClassBytes::new(Some(
                hir::ClassBytesRange::new(byte, byte),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Bytes(cls))
        }
        // Anchors and word boundaries consume no bytes; rebuild as-is.
        HirKind::Anchor(x) => Hir::anchor(x),
        HirKind::WordBoundary(x) => Hir::word_boundary(x),
        // Recursively strip sub-expressions, rebuilding the HIR node.
        HirKind::Repetition(mut x) => {
            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
            Hir::repetition(x)
        }
        HirKind::Group(mut x) => {
            x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?);
            Hir::group(x)
        }
        HirKind::Concat(xs) => {
            let xs = xs.into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::concat(xs)
        }
        HirKind::Alternation(xs) => {
            let xs = xs.into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::alternation(xs)
        }
    })
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use regex_syntax::Parser;

    use error::Error;
    use super::{LineTerminator, strip_from_match};

    /// Parse `pattern`, strip the given line terminator byte from its match
    /// states, and return the resulting pattern's concrete syntax.
    fn roundtrip(pattern: &str, byte: u8) -> String {
        roundtrip_line_term(pattern, LineTerminator::byte(byte)).unwrap()
    }

    /// Like `roundtrip`, but strips a CRLF line terminator instead of a
    /// single byte.
    fn roundtrip_crlf(pattern: &str) -> String {
        roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap()
    }

    /// Like `roundtrip`, but returns the `Result` instead of unwrapping, for
    /// asserting on patterns where stripping must fail.
    fn roundtrip_err(pattern: &str, byte: u8) -> Result<String, Error> {
        roundtrip_line_term(pattern, LineTerminator::byte(byte))
    }

    /// Common implementation: parse the pattern into an HIR, strip the line
    /// terminator from it, and render the result back to a pattern string.
    fn roundtrip_line_term(
        pattern: &str,
        line_term: LineTerminator,
    ) -> Result<String, Error> {
        let expr1 = Parser::new().parse(pattern).unwrap();
        let expr2 = strip_from_match(expr1, line_term)?;
        Ok(expr2.to_string())
    }

    #[test]
    fn various() {
        // Stripping a byte from a character class removes just that byte.
        assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]");
        assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]");
        // CRLF stripping removes both \r and \n from classes.
        assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]");
        assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]");
        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]");

        // Perl classes are rewritten as explicit classes with the
        // terminator's range carved out.
        assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
        assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");

        // A literal equal to the line terminator is an error, no matter
        // which escape spelling produced it.
        assert!(roundtrip_err(r"\n", b'\n').is_err());
        assert!(roundtrip_err(r"abc\n", b'\n').is_err());
        assert!(roundtrip_err(r"\nabc", b'\n').is_err());
        assert!(roundtrip_err(r"abc\nxyz", b'\n').is_err());
        assert!(roundtrip_err(r"\x0A", b'\n').is_err());
        assert!(roundtrip_err(r"\u000A", b'\n').is_err());
        assert!(roundtrip_err(r"\U0000000A", b'\n').is_err());
        assert!(roundtrip_err(r"\u{A}", b'\n').is_err());
        assert!(roundtrip_err("\n", b'\n').is_err());
    }
}
|
||||||
29
grep-regex/src/util.rs
Normal file
29
grep-regex/src/util.rs
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
/// Converts an arbitrary sequence of bytes to a literal suitable for building
/// a regular expression.
///
/// ASCII bytes that are not regex meta characters are emitted verbatim;
/// everything else (meta characters and bytes >= 0x80) is emitted as a
/// `\xHH` hex escape so it is matched literally.
///
/// NOTE(review): the hex escape for non-ASCII bytes assumes the resulting
/// pattern is compiled in a mode that permits matching arbitrary bytes
/// (e.g. with Unicode disabled) — confirm at the call sites.
pub fn bytes_to_regex(bs: &[u8]) -> String {
    use std::fmt::Write;
    use regex_syntax::is_meta_character;

    let mut s = String::with_capacity(bs.len());
    for &b in bs {
        if b <= 0x7F && !is_meta_character(b as char) {
            // Plain ASCII with no special regex meaning: copy through.
            write!(s, r"{}", b as char).unwrap();
        } else {
            // Escape so the byte is treated as a literal, not syntax.
            write!(s, r"\x{:02x}", b).unwrap();
        }
    }
    s
}
|
||||||
|
|
||||||
|
/// Converts arbitrary bytes to a nice string.
///
/// Each byte is rendered via `std::ascii::escape_default`, so printable
/// ASCII appears as-is and everything else as an escape such as `\n` or
/// `\xff`.
pub fn show_bytes(bs: &[u8]) -> String {
    use std::ascii::escape_default;

    // Every byte yielded by `escape_default` is itself ASCII, so each one
    // can be appended directly as a `char`.
    bs.iter()
        .flat_map(|&b| escape_default(b))
        .map(|b| b as char)
        .collect()
}
|
||||||
196
grep-regex/src/word.rs
Normal file
196
grep-regex/src/word.rs
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::cell::RefCell;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
use grep_matcher::{Match, Matcher, NoError};
|
||||||
|
use regex::bytes::{CaptureLocations, Regex};
|
||||||
|
use thread_local::CachedThreadLocal;
|
||||||
|
|
||||||
|
use config::ConfiguredHIR;
|
||||||
|
use error::Error;
|
||||||
|
use matcher::RegexCaptures;
|
||||||
|
|
||||||
|
/// A matcher for implementing "word match" semantics.
#[derive(Debug)]
pub struct WordMatcher {
    /// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
    regex: Regex,
    /// A map from capture group name to capture group index.
    ///
    /// Indices stored here are shifted down by one relative to `regex`,
    /// because the caller's entire pattern is wrapped as capture group 1
    /// (see `WordMatcher::new`).
    names: HashMap<String, usize>,
    /// A reusable buffer for finding the match location of the inner group.
    locs: Arc<CachedThreadLocal<RefCell<CaptureLocations>>>,
}
|
||||||
|
|
||||||
|
impl Clone for WordMatcher {
    fn clone(&self) -> WordMatcher {
        // We implement Clone manually so that we get a fresh CachedThreadLocal
        // such that it can set its own thread owner. This permits each thread
        // using `locs` to hit the fast path.
        WordMatcher {
            regex: self.regex.clone(),
            names: self.names.clone(),
            locs: Arc::new(CachedThreadLocal::new()),
        }
    }
}
|
||||||
|
|
||||||
|
impl WordMatcher {
    /// Create a new matcher from the given pattern that only produces matches
    /// that are considered "words."
    ///
    /// The given options are used to construct the regular expression
    /// internally.
    pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
        // Wrap the caller's pattern so a match must be bracketed by
        // non-word characters or the start/end of a line. The original
        // pattern becomes capture group 1.
        let word_expr = expr.with_pattern(|pat| {
            format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
        })?;
        let regex = word_expr.regex()?;
        let locs = Arc::new(CachedThreadLocal::new());

        // Record named groups, shifting each index down by one so callers
        // never see the synthetic outer group added above.
        let mut names = HashMap::new();
        for (i, optional_name) in regex.capture_names().enumerate() {
            if let Some(name) = optional_name {
                names.insert(name.to_string(), i.checked_sub(1).unwrap());
            }
        }
        Ok(WordMatcher { regex, names, locs })
    }
}
|
||||||
|
|
||||||
|
impl Matcher for WordMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        // To make this easy to get right, we extract captures here instead of
        // calling `find_at`. The actual match is at capture group `1` instead
        // of `0`. We *could* use `find_at` here and then trim the match after
        // the fact, but that's a bit harder to get right, and it's not clear
        // if it's worth it.

        // Fetch (or lazily create) this thread's reusable capture buffer so
        // a search does not allocate fresh capture locations every call.
        let cell = self.locs.get_or(|| {
            Box::new(RefCell::new(self.regex.capture_locations()))
        });
        let mut caps = cell.borrow_mut();
        // NOTE(review): the Option returned by `captures_read_at` is ignored
        // and group 1 is consulted directly; this relies on the `regex`
        // crate leaving group 1 unset (not stale from a prior search) when
        // there is no match — confirm against CaptureLocations semantics.
        self.regex.captures_read_at(&mut caps, haystack, at);
        Ok(caps.get(1).map(|m| Match::new(m.0, m.1)))
    }

    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        // The offset of 1 hides the synthetic outer group (added by
        // `WordMatcher::new`) from callers.
        Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1))
    }

    fn capture_count(&self) -> usize {
        // Subtract the synthetic outer group from the reported count.
        self.regex.captures_len().checked_sub(1).unwrap()
    }

    fn capture_index(&self, name: &str) -> Option<usize> {
        self.names.get(name).map(|i| *i)
    }

    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        let r = self.regex.captures_read_at(caps.locations(), haystack, at);
        Ok(r.is_some())
    }

    // We specifically do not implement other methods like find_iter or
    // captures_iter. Namely, the iter methods are guaranteed to be correct
    // by virtue of implementing find_at and captures_at above.
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use grep_matcher::{Captures, Match, Matcher};
    use config::Config;
    use super::WordMatcher;

    /// Build a `WordMatcher` from a pattern using the default config.
    fn matcher(pattern: &str) -> WordMatcher {
        let chir = Config::default().hir(pattern).unwrap();
        WordMatcher::new(&chir).unwrap()
    }

    /// Search via the `find` API and report the match offsets, if any.
    fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        matcher(pattern)
            .find(haystack.as_bytes())
            .unwrap()
            .map(|m| (m.start(), m.end()))
    }

    /// Like `find`, but goes through the captures API, reading the overall
    /// match from capture group 0.
    fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> {
        let m = matcher(pattern);
        let mut caps = m.new_captures().unwrap();
        if !m.captures(haystack.as_bytes(), &mut caps).unwrap() {
            None
        } else {
            caps.get(0).map(|m| (m.start(), m.end()))
        }
    }

    // Test that the standard `find` API reports offsets correctly.
    #[test]
    fn various_find() {
        assert_eq!(Some((0, 3)), find(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find(r"foo", "!foo("));
        assert_eq!(None, find(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
        assert_eq!(None, find(r"foo", "fooб"));
        // NOTE(review): the commented-out assertion records the behavior one
        // might expect for non-ASCII word boundaries; left as-is.
        // assert_eq!(Some((0, 3)), find(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find(r"-2", "-2"));
    }

    // Test that the captures API also reports offsets correctly, just as
    // find does. This exercises a different path in the code since captures
    // are handled differently.
    #[test]
    fn various_captures() {
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo"));
        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo("));
        assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo("));
        assert_eq!(None, find_by_caps(r"foo", "!afoo("));

        assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃"));
        assert_eq!(None, find_by_caps(r"foo", "fooб"));
        // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб"));

        // See: https://github.com/BurntSushi/ripgrep/issues/389
        assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2"));
    }

    // Test that the capture reporting methods work as advertised.
    #[test]
    fn capture_indexing() {
        let m = matcher(r"(a)(?P<foo>b)(c)");
        // Count and indices exclude the synthetic outer group.
        assert_eq!(4, m.capture_count());
        assert_eq!(Some(2), m.capture_index("foo"));

        let mut caps = m.new_captures().unwrap();
        assert_eq!(4, caps.len());

        assert!(m.captures(b"abc", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(0, 3)));
        assert_eq!(caps.get(1), Some(Match::new(0, 1)));
        assert_eq!(caps.get(2), Some(Match::new(1, 2)));
        assert_eq!(caps.get(3), Some(Match::new(2, 3)));
        assert_eq!(caps.get(4), None);

        // Offsets shift when the word is bracketed by non-word characters.
        assert!(m.captures(b"#abc#", &mut caps).unwrap());
        assert_eq!(caps.get(0), Some(Match::new(1, 4)));
        assert_eq!(caps.get(1), Some(Match::new(1, 2)));
        assert_eq!(caps.get(2), Some(Match::new(2, 3)));
        assert_eq!(caps.get(3), Some(Match::new(3, 4)));
        assert_eq!(caps.get(4), None);
    }
}
|
||||||
33
grep-searcher/Cargo.toml
Normal file
33
grep-searcher/Cargo.toml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
[package]
|
||||||
|
name = "grep-searcher"
|
||||||
|
version = "0.1.3" #:version
|
||||||
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
|
description = """
|
||||||
|
Fast line oriented regex searching as a library.
|
||||||
|
"""
|
||||||
|
documentation = "https://docs.rs/grep-searcher"
|
||||||
|
homepage = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
repository = "https://github.com/BurntSushi/ripgrep"
|
||||||
|
readme = "README.md"
|
||||||
|
keywords = ["regex", "grep", "egrep", "search", "pattern"]
|
||||||
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
bytecount = "0.5"
|
||||||
|
encoding_rs = "0.8.14"
|
||||||
|
encoding_rs_io = "0.1.4"
|
||||||
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
|
log = "0.4.5"
|
||||||
|
memchr = "2.1"
|
||||||
|
memmap = "0.7"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
grep-regex = { version = "0.1.1", path = "../grep-regex" }
|
||||||
|
regex = "1.1"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
default = ["bytecount/runtime-dispatch-simd"]
|
||||||
|
simd-accel = ["encoding_rs/simd-accel"]
|
||||||
|
|
||||||
|
# This feature is DEPRECATED. Runtime dispatch is used for SIMD now.
|
||||||
|
avx-accel = []
|
||||||
21
grep-searcher/LICENSE-MIT
Normal file
21
grep-searcher/LICENSE-MIT
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
The MIT License (MIT)
|
||||||
|
|
||||||
|
Copyright (c) 2015 Andrew Gallant
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
37
grep-searcher/README.md
Normal file
37
grep-searcher/README.md
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
grep-searcher
|
||||||
|
-------------
|
||||||
|
A high level library for executing fast line oriented searches. This handles
|
||||||
|
things like reporting contextual lines, counting lines, inverting a search,
|
||||||
|
detecting binary data, automatic UTF-16 transcoding and deciding whether or not
|
||||||
|
to use memory maps.
|
||||||
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep-searcher)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep-searcher](https://docs.rs/grep-searcher)
|
||||||
|
|
||||||
|
**NOTE:** You probably don't want to use this crate directly. Instead, you
|
||||||
|
should prefer the facade defined in the
|
||||||
|
[`grep`](https://docs.rs/grep)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep-searcher = "0.1"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep_searcher;
|
||||||
|
```
|
||||||
24
grep-searcher/UNLICENSE
Normal file
24
grep-searcher/UNLICENSE
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
This is free and unencumbered software released into the public domain.
|
||||||
|
|
||||||
|
Anyone is free to copy, modify, publish, use, compile, sell, or
|
||||||
|
distribute this software, either in source code form or as a compiled
|
||||||
|
binary, for any purpose, commercial or non-commercial, and by any
|
||||||
|
means.
|
||||||
|
|
||||||
|
In jurisdictions that recognize copyright laws, the author or authors
|
||||||
|
of this software dedicate any and all copyright interest in the
|
||||||
|
software to the public domain. We make this dedication for the benefit
|
||||||
|
of the public at large and to the detriment of our heirs and
|
||||||
|
successors. We intend this dedication to be an overt act of
|
||||||
|
relinquishment in perpetuity of all present and future rights to this
|
||||||
|
software under copyright law.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||||
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||||
|
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||||
|
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||||
|
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||||
|
OTHER DEALINGS IN THE SOFTWARE.
|
||||||
|
|
||||||
|
For more information, please refer to <http://unlicense.org/>
|
||||||
33
grep-searcher/examples/search-stdin.rs
Normal file
33
grep-searcher/examples/search-stdin.rs
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
extern crate grep_regex;
|
||||||
|
extern crate grep_searcher;
|
||||||
|
|
||||||
|
use std::env;
|
||||||
|
use std::error::Error;
|
||||||
|
use std::io;
|
||||||
|
use std::process;
|
||||||
|
|
||||||
|
use grep_regex::RegexMatcher;
|
||||||
|
use grep_searcher::Searcher;
|
||||||
|
use grep_searcher::sinks::UTF8;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
if let Err(err) = example() {
|
||||||
|
eprintln!("{}", err);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a regex pattern from the first command line argument, then search
/// stdin with it, printing each matching line prefixed by its line number.
fn example() -> Result<(), Box<Error>> {
    // The first positional argument is the pattern; missing it is an error.
    let pattern = match env::args().nth(1) {
        Some(pattern) => pattern,
        None => return Err(From::from(format!(
            "Usage: search-stdin <pattern>"
        ))),
    };
    let matcher = RegexMatcher::new(&pattern)?;
    // The UTF8 sink invokes the closure once per matching line. The line
    // presumably includes its terminator (hence `print!`, not `println!`)
    // — confirm against the `sinks::UTF8` documentation.
    Searcher::new().search_reader(&matcher, io::stdin(), UTF8(|lnum, line| {
        print!("{}:{}", lnum, line);
        // Returning true tells the searcher to keep going.
        Ok(true)
    }))?;
    Ok(())
}
|
||||||
132
grep-searcher/src/lib.rs
Normal file
132
grep-searcher/src/lib.rs
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
/*!
|
||||||
|
This crate provides an implementation of line oriented search, with optional
|
||||||
|
support for multi-line search.
|
||||||
|
|
||||||
|
# Brief overview
|
||||||
|
|
||||||
|
The principle type in this crate is a
|
||||||
|
[`Searcher`](struct.Searcher.html),
|
||||||
|
which can be configured and built by a
|
||||||
|
[`SearcherBuilder`](struct.SearcherBuilder.html).
|
||||||
|
A `Searcher` is responsible for reading bytes from a source (e.g., a file),
|
||||||
|
executing a search of those bytes using a `Matcher` (e.g., a regex) and then
|
||||||
|
reporting the results of that search to a
|
||||||
|
[`Sink`](trait.Sink.html)
|
||||||
|
(e.g., stdout). The `Searcher` itself is principally responsible for managing
|
||||||
|
the consumption of bytes from a source and applying a `Matcher` over those
|
||||||
|
bytes in an efficient way. The `Searcher` is also responsible for inverting
|
||||||
|
a search, counting lines, reporting contextual lines, detecting binary data
|
||||||
|
and even deciding whether or not to use memory maps.
|
||||||
|
|
||||||
|
A `Matcher` (which is defined in the
|
||||||
|
[`grep-matcher`](https://crates.io/crates/grep-matcher)
|
||||||
|
crate) is a trait for describing the lowest levels of pattern search in a
|
||||||
|
generic way. The interface itself is very similar to the interface of a regular
|
||||||
|
expression. For example, the
|
||||||
|
[`grep-regex`](https://crates.io/crates/grep-regex)
|
||||||
|
crate provides an implementation of the `Matcher` trait using Rust's
|
||||||
|
[`regex`](https://crates.io/crates/regex)
|
||||||
|
crate.
|
||||||
|
|
||||||
|
Finally, a `Sink` describes how callers receive search results produced by a
|
||||||
|
`Searcher`. This includes routines that are called at the beginning and end of
|
||||||
|
a search, in addition to routines that are called when matching or contextual
|
||||||
|
lines are found by the `Searcher`. Implementations of `Sink` can be trivially
|
||||||
|
simple, or extraordinarily complex, such as the
|
||||||
|
`Standard` printer found in the
|
||||||
|
[`grep-printer`](https://crates.io/crates/grep-printer)
|
||||||
|
crate, which effectively implements grep-like output.
|
||||||
|
This crate also provides convenience `Sink` implementations in the
|
||||||
|
[`sinks`](sinks/index.html)
|
||||||
|
sub-module for easy searching with closures.
|
||||||
|
|
||||||
|
# Example
|
||||||
|
|
||||||
|
This example shows how to execute the searcher and read the search results
|
||||||
|
using the
|
||||||
|
[`UTF8`](sinks/struct.UTF8.html)
|
||||||
|
implementation of `Sink`.
|
||||||
|
|
||||||
|
```
|
||||||
|
extern crate grep_matcher;
|
||||||
|
extern crate grep_regex;
|
||||||
|
extern crate grep_searcher;
|
||||||
|
|
||||||
|
use std::error::Error;
|
||||||
|
|
||||||
|
use grep_matcher::Matcher;
|
||||||
|
use grep_regex::RegexMatcher;
|
||||||
|
use grep_searcher::Searcher;
|
||||||
|
use grep_searcher::sinks::UTF8;
|
||||||
|
|
||||||
|
const SHERLOCK: &'static [u8] = b"\
|
||||||
|
For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
Holmeses, success in the province of detective work must always
|
||||||
|
be, to a very large extent, the result of luck. Sherlock Holmes
|
||||||
|
can extract a clew from a wisp of straw or a flake of cigar ash;
|
||||||
|
but Doctor Watson has to have it taken out for him and dusted,
|
||||||
|
and exhibited clearly, with a label attached.
|
||||||
|
";
|
||||||
|
|
||||||
|
# fn main() { example().unwrap() }
|
||||||
|
fn example() -> Result<(), Box<Error>> {
|
||||||
|
let matcher = RegexMatcher::new(r"Doctor \w+")?;
|
||||||
|
let mut matches: Vec<(u64, String)> = vec![];
|
||||||
|
Searcher::new().search_slice(&matcher, SHERLOCK, UTF8(|lnum, line| {
|
||||||
|
// We are guaranteed to find a match, so the unwrap is OK.
|
||||||
|
let mymatch = matcher.find(line.as_bytes())?.unwrap();
|
||||||
|
matches.push((lnum, line[mymatch].to_string()));
|
||||||
|
Ok(true)
|
||||||
|
}))?;
|
||||||
|
|
||||||
|
assert_eq!(matches.len(), 2);
|
||||||
|
assert_eq!(
|
||||||
|
matches[0],
|
||||||
|
(1, "Doctor Watsons".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
matches[1],
|
||||||
|
(5, "Doctor Watson".to_string())
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
See also `examples/search-stdin.rs` from the root of this crate's directory
|
||||||
|
to see a similar example that accepts a pattern on the command line and
|
||||||
|
searches stdin.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#![deny(missing_docs)]
|
||||||
|
|
||||||
|
extern crate bytecount;
|
||||||
|
extern crate encoding_rs;
|
||||||
|
extern crate encoding_rs_io;
|
||||||
|
extern crate grep_matcher;
|
||||||
|
#[macro_use]
|
||||||
|
extern crate log;
|
||||||
|
extern crate memchr;
|
||||||
|
extern crate memmap;
|
||||||
|
#[cfg(test)]
|
||||||
|
extern crate regex;
|
||||||
|
|
||||||
|
pub use lines::{LineIter, LineStep};
|
||||||
|
pub use searcher::{
|
||||||
|
BinaryDetection, ConfigError, Encoding, MmapChoice,
|
||||||
|
Searcher, SearcherBuilder,
|
||||||
|
};
|
||||||
|
pub use sink::{
|
||||||
|
Sink, SinkError,
|
||||||
|
SinkContext, SinkContextKind, SinkFinish, SinkMatch,
|
||||||
|
};
|
||||||
|
pub use sink::sinks;
|
||||||
|
|
||||||
|
#[macro_use]
|
||||||
|
mod macros;
|
||||||
|
|
||||||
|
mod line_buffer;
|
||||||
|
mod lines;
|
||||||
|
mod searcher;
|
||||||
|
mod sink;
|
||||||
|
#[cfg(test)]
|
||||||
|
mod testutil;
|
||||||
968
grep-searcher/src/line_buffer.rs
Normal file
968
grep-searcher/src/line_buffer.rs
Normal file
@@ -0,0 +1,968 @@
|
|||||||
|
use std::cmp;
|
||||||
|
use std::io;
|
||||||
|
use std::ptr;
|
||||||
|
|
||||||
|
use memchr::{memchr, memrchr};
|
||||||
|
|
||||||
|
/// The default buffer capacity that we use for the line buffer.
|
||||||
|
pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB
|
||||||
|
|
||||||
|
/// The behavior of a searcher in the face of long lines and big contexts.
///
/// When searching data incrementally using a fixed size buffer, this controls
/// the amount of *additional* memory to allocate beyond the size of the buffer
/// to accommodate lines (which may include the lines in a context window, when
/// enabled) that do not fit in the buffer.
///
/// The default is to eagerly allocate without a limit.
#[derive(Clone, Copy, Debug)]
pub enum BufferAllocation {
    /// Attempt to expand the size of the buffer until either at least the next
    /// line fits into memory or until all available memory is exhausted.
    ///
    /// This is the default.
    Eager,
    /// Limit the amount of additional memory allocated to the given size. If
    /// a line is found that requires more memory than is allowed here, then
    /// stop reading and return an error.
    ///
    /// NOTE(review): the limit appears to be measured in bytes — confirm
    /// against the enforcement site before relying on the unit.
    Error(usize),
}

impl Default for BufferAllocation {
    fn default() -> BufferAllocation {
        BufferAllocation::Eager
    }
}
|
||||||
|
|
||||||
|
/// Create a new error to be used when a configured allocation limit has been
/// reached.
///
/// The error is an `io::Error` of kind `Other` whose message names the
/// offending limit.
pub fn alloc_error(limit: usize) -> io::Error {
    io::Error::new(
        io::ErrorKind::Other,
        format!("configured allocation limit ({}) exceeded", limit),
    )
}
|
||||||
|
|
||||||
|
/// The behavior of binary detection in the line buffer.
///
/// Binary detection is the process of _heuristically_ identifying whether a
/// given chunk of data is binary or not, and then taking an action based on
/// the result of that heuristic. The motivation behind detecting binary data
/// is that binary data often indicates data that is undesirable to search
/// using textual patterns. Of course, there are many cases in which this isn't
/// true, which is why binary detection is disabled by default.
#[derive(Clone, Copy, Debug)]
pub enum BinaryDetection {
    /// No binary detection is performed. Data reported by the line buffer may
    /// contain arbitrary bytes.
    None,
    /// The given byte is searched in all contents read by the line buffer. If
    /// it occurs, then the data is considered binary and the line buffer acts
    /// as if it reached EOF. The line buffer guarantees that this byte will
    /// never be observable by callers.
    Quit(u8),
    /// The given byte is searched in all contents read by the line buffer. If
    /// it occurs, then it is replaced by the line terminator. The line buffer
    /// guarantees that this byte will never be observable by callers.
    Convert(u8),
}

impl Default for BinaryDetection {
    fn default() -> BinaryDetection {
        BinaryDetection::None
    }
}

impl BinaryDetection {
    /// Returns true if and only if the detection heuristic demands that
    /// the line buffer stop reading data once binary data is observed.
    fn is_quit(&self) -> bool {
        match *self {
            BinaryDetection::Quit(_) => true,
            _ => false,
        }
    }
}
|
||||||
|
|
||||||
|
/// The configuration of a buffer. This contains options that are fixed once
/// a buffer has been constructed.
#[derive(Clone, Copy, Debug)]
struct Config {
    /// The number of bytes to attempt to read at a time.
    capacity: usize,
    /// The line terminator.
    lineterm: u8,
    /// The behavior for handling long lines.
    buffer_alloc: BufferAllocation,
    /// When set, the presence of the given byte indicates binary content.
    binary: BinaryDetection,
}

impl Default for Config {
    fn default() -> Config {
        Config {
            // 8 KB reads by default; see DEFAULT_BUFFER_CAPACITY.
            capacity: DEFAULT_BUFFER_CAPACITY,
            lineterm: b'\n',
            buffer_alloc: BufferAllocation::default(),
            binary: BinaryDetection::default(),
        }
    }
}
|
||||||
|
|
||||||
|
/// A builder for constructing line buffers.
#[derive(Clone, Debug, Default)]
pub struct LineBufferBuilder {
    config: Config,
}

impl LineBufferBuilder {
    /// Create a new builder for a buffer.
    pub fn new() -> LineBufferBuilder {
        LineBufferBuilder { config: Config::default() }
    }

    /// Create a new line buffer from this builder's configuration.
    pub fn build(&self) -> LineBuffer {
        LineBuffer {
            config: self.config,
            // The buffer starts out zero-filled at the configured capacity;
            // all offsets point at the beginning until the first fill.
            buf: vec![0; self.config.capacity],
            pos: 0,
            last_lineterm: 0,
            end: 0,
            absolute_byte_offset: 0,
            binary_byte_offset: None,
        }
    }

    /// Set the default capacity to use for a buffer.
    ///
    /// In general, the capacity of a buffer corresponds to the amount of data
    /// to hold in memory, and the size of the reads to make to the underlying
    /// reader.
    ///
    /// This is set to a reasonable default and probably shouldn't be changed
    /// unless there's a specific reason to do so.
    pub fn capacity(&mut self, capacity: usize) -> &mut LineBufferBuilder {
        self.config.capacity = capacity;
        self
    }

    /// Set the line terminator for the buffer.
    ///
    /// Every buffer has a line terminator, and this line terminator is used
    /// to determine how to roll the buffer forward. For example, when a read
    /// to the buffer's underlying reader occurs, the end of the data that is
    /// read is likely to correspond to an incomplete line. As a line buffer,
    /// callers should not access this data since it is incomplete. The line
    /// terminator is how the line buffer determines the part of the read that
    /// is incomplete.
    ///
    /// By default, this is set to `b'\n'`.
    pub fn line_terminator(&mut self, lineterm: u8) -> &mut LineBufferBuilder {
        self.config.lineterm = lineterm;
        self
    }

    /// Set the maximum amount of additional memory to allocate for long lines.
    ///
    /// In order to enable line oriented search, a fundamental requirement is
    /// that, at a minimum, each line must be able to fit into memory. This
    /// setting controls how big that line is allowed to be. By default, this
    /// is set to `BufferAllocation::Eager`, which means a line buffer will
    /// attempt to allocate as much memory as possible to fit a line, and will
    /// only be limited by available memory.
    ///
    /// Note that this setting only applies to the amount of *additional*
    /// memory to allocate, beyond the capacity of the buffer. That means that
    /// a value of `0` is sensible, and in particular, will guarantee that a
    /// line buffer will never allocate additional memory beyond its initial
    /// capacity.
    pub fn buffer_alloc(
        &mut self,
        behavior: BufferAllocation,
    ) -> &mut LineBufferBuilder {
        self.config.buffer_alloc = behavior;
        self
    }

    /// Whether to enable binary detection or not. Depending on the setting,
    /// this can either cause the line buffer to report EOF early or it can
    /// cause the line buffer to clean the data.
    ///
    /// By default, this is disabled. In general, binary detection should be
    /// viewed as an imperfect heuristic.
    pub fn binary_detection(
        &mut self,
        detection: BinaryDetection,
    ) -> &mut LineBufferBuilder {
        self.config.binary = detection;
        self
    }
}
|
||||||
|
|
||||||
|
/// A line buffer reader efficiently reads a line oriented buffer from an
|
||||||
|
/// arbitrary reader.
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct LineBufferReader<'b, R> {
|
||||||
|
rdr: R,
|
||||||
|
line_buffer: &'b mut LineBuffer,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'b, R: io::Read> LineBufferReader<'b, R> {
|
||||||
|
/// Create a new buffered reader that reads from `rdr` and uses the given
|
||||||
|
/// `line_buffer` as an intermediate buffer.
|
||||||
|
///
|
||||||
|
/// This does not change the binary detection behavior of the given line
|
||||||
|
/// buffer.
|
||||||
|
pub fn new(
|
||||||
|
rdr: R,
|
||||||
|
line_buffer: &'b mut LineBuffer,
|
||||||
|
) -> LineBufferReader<'b, R> {
|
||||||
|
line_buffer.clear();
|
||||||
|
LineBufferReader { rdr, line_buffer }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The absolute byte offset which corresponds to the starting offsets
|
||||||
|
/// of the data returned by `buffer` relative to the beginning of the
|
||||||
|
/// underlying reader's contents. As such, this offset does not generally
|
||||||
|
/// correspond to an offset in memory. It is typically used for reporting
|
||||||
|
/// purposes. It can also be used for counting the number of bytes that
|
||||||
|
/// have been searched.
|
||||||
|
pub fn absolute_byte_offset(&self) -> u64 {
|
||||||
|
self.line_buffer.absolute_byte_offset()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If binary data was detected, then this returns the absolute byte offset
|
||||||
|
/// at which binary data was initially found.
|
||||||
|
pub fn binary_byte_offset(&self) -> Option<u64> {
|
||||||
|
self.line_buffer.binary_byte_offset()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fill the contents of this buffer by discarding the part of the buffer
|
||||||
|
/// that has been consumed. The free space created by discarding the
|
||||||
|
/// consumed part of the buffer is then filled with new data from the
|
||||||
|
/// reader.
|
||||||
|
///
|
||||||
|
/// If EOF is reached, then `false` is returned. Otherwise, `true` is
|
||||||
|
/// returned. (Note that if this line buffer's binary detection is set to
|
||||||
|
/// `Quit`, then the presence of binary data will cause this buffer to
|
||||||
|
/// behave as if it had seen EOF at the first occurrence of binary data.)
|
||||||
|
///
|
||||||
|
/// This forwards any errors returned by the underlying reader, and will
|
||||||
|
/// also return an error if the buffer must be expanded past its allocation
|
||||||
|
/// limit, as governed by the buffer allocation strategy.
|
||||||
|
pub fn fill(&mut self) -> Result<bool, io::Error> {
|
||||||
|
self.line_buffer.fill(&mut self.rdr)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the contents of this buffer.
|
||||||
|
pub fn buffer(&self) -> &[u8] {
|
||||||
|
self.line_buffer.buffer()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume the number of bytes provided. This must be less than or equal
|
||||||
|
/// to the number of bytes returned by `buffer`.
|
||||||
|
pub fn consume(&mut self, amt: usize) {
|
||||||
|
self.line_buffer.consume(amt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
|
||||||
|
/// guaranteed to return an empty slice until the buffer is refilled.
|
||||||
|
///
|
||||||
|
/// This is a convenience function for `consume(buffer.len())`.
|
||||||
|
#[cfg(test)]
|
||||||
|
fn consume_all(&mut self) {
|
||||||
|
self.line_buffer.consume_all();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A line buffer manages a (typically fixed) buffer for holding lines.
|
||||||
|
///
|
||||||
|
/// Callers should create line buffers sparingly and reuse them when possible.
|
||||||
|
/// Line buffers cannot be used directly, but instead must be used via the
|
||||||
|
/// LineBufferReader.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct LineBuffer {
|
||||||
|
/// The configuration of this buffer.
|
||||||
|
config: Config,
|
||||||
|
/// The primary buffer with which to hold data.
|
||||||
|
buf: Vec<u8>,
|
||||||
|
/// The current position of this buffer. This is always a valid sliceable
|
||||||
|
/// index into `buf`, and its maximum value is the length of `buf`.
|
||||||
|
pos: usize,
|
||||||
|
/// The end position of searchable content in this buffer. This is either
|
||||||
|
/// set to just after the final line terminator in the buffer, or to just
|
||||||
|
/// after the end of the last byte emitted by the reader when the reader
|
||||||
|
/// has been exhausted.
|
||||||
|
last_lineterm: usize,
|
||||||
|
/// The end position of the buffer. This is always greater than or equal to
|
||||||
|
/// last_lineterm. The bytes between last_lineterm and end, if any, always
|
||||||
|
/// correspond to a partial line.
|
||||||
|
end: usize,
|
||||||
|
/// The absolute byte offset corresponding to `pos`. This is most typically
|
||||||
|
/// not a valid index into addressable memory, but rather, an offset that
|
||||||
|
/// is relative to all data that passes through a line buffer (since
|
||||||
|
/// construction or since the last time `clear` was called).
|
||||||
|
///
|
||||||
|
/// When the line buffer reaches EOF, this is set to the position just
|
||||||
|
/// after the last byte read from the underlying reader. That is, it
|
||||||
|
/// becomes the total count of bytes that have been read.
|
||||||
|
absolute_byte_offset: u64,
|
||||||
|
/// If binary data was found, this records the absolute byte offset at
|
||||||
|
/// which it was first detected.
|
||||||
|
binary_byte_offset: Option<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl LineBuffer {
|
||||||
|
/// Reset this buffer, such that it can be used with a new reader.
|
||||||
|
fn clear(&mut self) {
|
||||||
|
self.pos = 0;
|
||||||
|
self.last_lineterm = 0;
|
||||||
|
self.end = 0;
|
||||||
|
self.absolute_byte_offset = 0;
|
||||||
|
self.binary_byte_offset = None;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The absolute byte offset which corresponds to the starting offsets
|
||||||
|
/// of the data returned by `buffer` relative to the beginning of the
|
||||||
|
/// reader's contents. As such, this offset does not generally correspond
|
||||||
|
/// to an offset in memory. It is typically used for reporting purposes,
|
||||||
|
/// particularly in error messages.
|
||||||
|
///
|
||||||
|
/// This is reset to `0` when `clear` is called.
|
||||||
|
fn absolute_byte_offset(&self) -> u64 {
|
||||||
|
self.absolute_byte_offset
|
||||||
|
}
|
||||||
|
|
||||||
|
/// If binary data was detected, then this returns the absolute byte offset
|
||||||
|
/// at which binary data was initially found.
|
||||||
|
fn binary_byte_offset(&self) -> Option<u64> {
|
||||||
|
self.binary_byte_offset
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the contents of this buffer.
|
||||||
|
fn buffer(&self) -> &[u8] {
|
||||||
|
&self.buf[self.pos..self.last_lineterm]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the contents of the free space beyond the end of the buffer as
|
||||||
|
/// a mutable slice.
|
||||||
|
fn free_buffer(&mut self) -> &mut [u8] {
|
||||||
|
&mut self.buf[self.end..]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consume the number of bytes provided. This must be less than or equal
|
||||||
|
/// to the number of bytes returned by `buffer`.
|
||||||
|
fn consume(&mut self, amt: usize) {
|
||||||
|
assert!(amt <= self.buffer().len());
|
||||||
|
self.pos += amt;
|
||||||
|
self.absolute_byte_offset += amt as u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
|
||||||
|
/// guaranteed to return an empty slice until the buffer is refilled.
|
||||||
|
///
|
||||||
|
/// This is a convenience function for `consume(buffer.len())`.
|
||||||
|
#[cfg(test)]
|
||||||
|
fn consume_all(&mut self) {
|
||||||
|
let amt = self.buffer().len();
|
||||||
|
self.consume(amt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fill the contents of this buffer by discarding the part of the buffer
|
||||||
|
/// that has been consumed. The free space created by discarding the
|
||||||
|
/// consumed part of the buffer is then filled with new data from the given
|
||||||
|
/// reader.
|
||||||
|
///
|
||||||
|
/// Callers should provide the same reader to this line buffer in
|
||||||
|
/// subsequent calls to fill. A different reader can only be used
|
||||||
|
/// immediately following a call to `clear`.
|
||||||
|
///
|
||||||
|
/// If EOF is reached, then `false` is returned. Otherwise, `true` is
|
||||||
|
/// returned. (Note that if this line buffer's binary detection is set to
|
||||||
|
/// `Quit`, then the presence of binary data will cause this buffer to
|
||||||
|
/// behave as if it had seen EOF.)
|
||||||
|
///
|
||||||
|
/// This forwards any errors returned by `rdr`, and will also return an
|
||||||
|
/// error if the buffer must be expanded past its allocation limit, as
|
||||||
|
/// governed by the buffer allocation strategy.
|
||||||
|
fn fill<R: io::Read>(&mut self, mut rdr: R) -> Result<bool, io::Error> {
|
||||||
|
// If the binary detection heuristic tells us to quit once binary data
|
||||||
|
// has been observed, then we no longer read new data and reach EOF
|
||||||
|
// once the current buffer has been consumed.
|
||||||
|
if self.config.binary.is_quit() && self.binary_byte_offset.is_some() {
|
||||||
|
return Ok(!self.buffer().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
self.roll();
|
||||||
|
assert_eq!(self.pos, 0);
|
||||||
|
loop {
|
||||||
|
self.ensure_capacity()?;
|
||||||
|
let readlen = rdr.read(self.free_buffer())?;
|
||||||
|
if readlen == 0 {
|
||||||
|
// We're only done reading for good once the caller has
|
||||||
|
// consumed everything.
|
||||||
|
self.last_lineterm = self.end;
|
||||||
|
return Ok(!self.buffer().is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get a mutable view into the bytes we've just read. These are
|
||||||
|
// the bytes that we do binary detection on, and also the bytes we
|
||||||
|
// search to find the last line terminator. We need a mutable slice
|
||||||
|
// in the case of binary conversion.
|
||||||
|
let oldend = self.end;
|
||||||
|
self.end += readlen;
|
||||||
|
let newbytes = &mut self.buf[oldend..self.end];
|
||||||
|
|
||||||
|
// Binary detection.
|
||||||
|
match self.config.binary {
|
||||||
|
BinaryDetection::None => {} // nothing to do
|
||||||
|
BinaryDetection::Quit(byte) => {
|
||||||
|
if let Some(i) = memchr(byte, newbytes) {
|
||||||
|
self.end = oldend + i;
|
||||||
|
self.last_lineterm = self.end;
|
||||||
|
self.binary_byte_offset =
|
||||||
|
Some(self.absolute_byte_offset + self.end as u64);
|
||||||
|
// If the first byte in our buffer is a binary byte,
|
||||||
|
// then our buffer is empty and we should report as
|
||||||
|
// such to the caller.
|
||||||
|
return Ok(self.pos < self.end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
BinaryDetection::Convert(byte) => {
|
||||||
|
if let Some(i) = replace_bytes(
|
||||||
|
newbytes,
|
||||||
|
byte,
|
||||||
|
self.config.lineterm,
|
||||||
|
) {
|
||||||
|
// Record only the first binary offset.
|
||||||
|
if self.binary_byte_offset.is_none() {
|
||||||
|
self.binary_byte_offset =
|
||||||
|
Some(self.absolute_byte_offset
|
||||||
|
+ (oldend + i) as u64);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update our `last_lineterm` positions if we read one.
|
||||||
|
if let Some(i) = memrchr(self.config.lineterm, newbytes) {
|
||||||
|
self.last_lineterm = oldend + i + 1;
|
||||||
|
return Ok(true);
|
||||||
|
}
|
||||||
|
// At this point, if we couldn't find a line terminator, then we
|
||||||
|
// don't have a complete line. Therefore, we try to read more!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Roll the unconsumed parts of the buffer to the front.
|
||||||
|
///
|
||||||
|
/// This operation is idempotent.
|
||||||
|
///
|
||||||
|
/// After rolling, `last_lineterm` and `end` point to the same location,
|
||||||
|
/// and `pos` is always set to `0`.
|
||||||
|
fn roll(&mut self) {
|
||||||
|
if self.pos == self.end {
|
||||||
|
self.pos = 0;
|
||||||
|
self.last_lineterm = 0;
|
||||||
|
self.end = 0;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(self.pos < self.end && self.end <= self.buf.len());
|
||||||
|
let roll_len = self.end - self.pos;
|
||||||
|
unsafe {
|
||||||
|
// SAFETY: A buffer contains Copy data, so there's no problem
|
||||||
|
// moving it around. Safety also depends on our indices being
|
||||||
|
// in bounds, which they should always be, and we enforce with
|
||||||
|
// an assert above.
|
||||||
|
//
|
||||||
|
// It seems like it should be possible to do this in safe code that
|
||||||
|
// results in the same codegen. I tried the obvious:
|
||||||
|
//
|
||||||
|
// for (src, dst) in (self.pos..self.end).zip(0..) {
|
||||||
|
// self.buf[dst] = self.buf[src];
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// But the above does not work, and in fact compiles down to a slow
|
||||||
|
// byte-by-byte loop. I tried a few other minor variations, but
|
||||||
|
// alas, better minds might prevail.
|
||||||
|
//
|
||||||
|
// Overall, this doesn't save us *too* much. It mostly matters when
|
||||||
|
// the number of bytes we're copying is large, which can happen
|
||||||
|
// if the searcher is asked to produce a lot of context. We could
|
||||||
|
// decide this isn't worth it, but it does make an appreciable
|
||||||
|
// impact at or around the context=30 range on my machine.
|
||||||
|
//
|
||||||
|
// We could also use a temporary buffer that compiles down to two
|
||||||
|
// memcpys and is faster than the byte-at-a-time loop, but it
|
||||||
|
// complicates our options for limiting memory allocation a bit.
|
||||||
|
ptr::copy(
|
||||||
|
self.buf[self.pos..].as_ptr(),
|
||||||
|
self.buf.as_mut_ptr(),
|
||||||
|
roll_len,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
self.pos = 0;
|
||||||
|
self.last_lineterm = roll_len;
|
||||||
|
self.end = roll_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Ensures that the internal buffer has a non-zero amount of free space
|
||||||
|
/// in which to read more data. If there is no free space, then more is
|
||||||
|
/// allocated. If the allocation must exceed the configured limit, then
|
||||||
|
/// this returns an error.
|
||||||
|
fn ensure_capacity(&mut self) -> Result<(), io::Error> {
|
||||||
|
if !self.free_buffer().is_empty() {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
// `len` is used for computing the next allocation size. The capacity
|
||||||
|
// is permitted to start at `0`, so we make sure it's at least `1`.
|
||||||
|
let len = cmp::max(1, self.buf.len());
|
||||||
|
let additional = match self.config.buffer_alloc {
|
||||||
|
BufferAllocation::Eager => len * 2,
|
||||||
|
BufferAllocation::Error(limit) => {
|
||||||
|
let used = self.buf.len() - self.config.capacity;
|
||||||
|
let n = cmp::min(len * 2, limit - used);
|
||||||
|
if n == 0 {
|
||||||
|
return Err(alloc_error(self.config.capacity + limit));
|
||||||
|
}
|
||||||
|
n
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assert!(additional > 0);
|
||||||
|
let newlen = self.buf.len() + additional;
|
||||||
|
self.buf.resize(newlen, 0);
|
||||||
|
assert!(!self.free_buffer().is_empty());
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replaces `src` with `replacement` in bytes.
|
||||||
|
fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
|
||||||
|
if src == replacement {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let mut first_pos = None;
|
||||||
|
let mut pos = 0;
|
||||||
|
while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) {
|
||||||
|
if first_pos.is_none() {
|
||||||
|
first_pos = Some(i);
|
||||||
|
}
|
||||||
|
bytes[i] = replacement;
|
||||||
|
pos = i + 1;
|
||||||
|
while bytes.get(pos) == Some(&src) {
|
||||||
|
bytes[pos] = replacement;
|
||||||
|
pos += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
first_pos
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::str;
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
const SHERLOCK: &'static str = "\
|
||||||
|
For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
Holmeses, success in the province of detective work must always
|
||||||
|
be, to a very large extent, the result of luck. Sherlock Holmes
|
||||||
|
can extract a clew from a wisp of straw or a flake of cigar ash;
|
||||||
|
but Doctor Watson has to have it taken out for him and dusted,
|
||||||
|
and exhibited clearly, with a label attached.\
|
||||||
|
";
|
||||||
|
|
||||||
|
fn s(slice: &str) -> String {
|
||||||
|
slice.to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn btos(slice: &[u8]) -> &str {
|
||||||
|
str::from_utf8(slice).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn replace_str(
|
||||||
|
slice: &str,
|
||||||
|
src: u8,
|
||||||
|
replacement: u8,
|
||||||
|
) -> (String, Option<usize>) {
|
||||||
|
let mut dst = slice.to_string().into_bytes();
|
||||||
|
let result = replace_bytes(&mut dst, src, replacement);
|
||||||
|
(String::from_utf8(dst).unwrap(), result)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn replace() {
|
||||||
|
assert_eq!(replace_str("abc", b'b', b'z'), (s("azc"), Some(1)));
|
||||||
|
assert_eq!(replace_str("abb", b'b', b'z'), (s("azz"), Some(1)));
|
||||||
|
assert_eq!(replace_str("aba", b'a', b'z'), (s("zbz"), Some(0)));
|
||||||
|
assert_eq!(replace_str("bbb", b'b', b'z'), (s("zzz"), Some(0)));
|
||||||
|
assert_eq!(replace_str("bac", b'b', b'z'), (s("zac"), Some(0)));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_basics1() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nlisa\n");
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 0);
|
||||||
|
rdr.consume(5);
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 5);
|
||||||
|
rdr.consume_all();
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 11);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "maggie");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_basics2() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_basics3() {
|
||||||
|
let bytes = "\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_basics4() {
|
||||||
|
let bytes = "\n\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "\n\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_empty() {
|
||||||
|
let bytes = "";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_zero_capacity() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().capacity(0).build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
while rdr.fill().unwrap() {
|
||||||
|
rdr.consume_all();
|
||||||
|
}
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_small_capacity() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().capacity(1).build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
let mut got = vec![];
|
||||||
|
while rdr.fill().unwrap() {
|
||||||
|
got.extend(rdr.buffer());
|
||||||
|
rdr.consume_all();
|
||||||
|
}
|
||||||
|
assert_eq!(bytes, btos(&got));
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_limited_capacity1() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.capacity(1)
|
||||||
|
.buffer_alloc(BufferAllocation::Error(5))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "lisa\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
// This returns an error because while we have just enough room to
|
||||||
|
// store maggie in the buffer, we *don't* have enough room to read one
|
||||||
|
// more byte, so we don't know whether we're at EOF or not, and
|
||||||
|
// therefore must give up.
|
||||||
|
assert!(rdr.fill().is_err());
|
||||||
|
|
||||||
|
// We can mush on though!
|
||||||
|
assert_eq!(btos(rdr.buffer()), "m");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "aggie");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_limited_capacity2() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.capacity(1)
|
||||||
|
.buffer_alloc(BufferAllocation::Error(6))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "lisa\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
// We have just enough space.
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "maggie");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_limited_capacity3() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.capacity(1)
|
||||||
|
.buffer_alloc(BufferAllocation::Error(0))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.fill().is_err());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_none() {
|
||||||
|
let bytes = "homer\nli\x00sa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new().build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_quit1() {
|
||||||
|
let bytes = "homer\nli\x00sa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Quit(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nli");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 8);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(8));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_quit2() {
|
||||||
|
let bytes = "\x00homer\nlisa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Quit(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "");
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 0);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_quit3() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie\n\x00";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Quit(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 1);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_quit4() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie\x00\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Quit(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 2);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_quit5() {
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Quit(b'u'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(SHERLOCK.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "\
|
||||||
|
For the Doctor Watsons of this world, as opposed to the Sherlock
|
||||||
|
Holmeses, s\
|
||||||
|
");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), 76);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(76));
|
||||||
|
assert_eq!(SHERLOCK.as_bytes()[76], b'u');
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_convert1() {
|
||||||
|
let bytes = "homer\nli\x00sa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Convert(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(8));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_convert2() {
|
||||||
|
let bytes = "\x00homer\nlisa\nmaggie\n";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Convert(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn buffer_binary_convert3() {
|
||||||
|
let bytes = "homer\nlisa\nmaggie\n\x00";
|
||||||
|
let mut linebuf = LineBufferBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::Convert(b'\x00'))
|
||||||
|
.build();
|
||||||
|
let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
|
||||||
|
|
||||||
|
assert!(rdr.buffer().is_empty());
|
||||||
|
|
||||||
|
assert!(rdr.fill().unwrap());
|
||||||
|
assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
|
||||||
|
rdr.consume_all();
|
||||||
|
|
||||||
|
assert!(!rdr.fill().unwrap());
|
||||||
|
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
|
||||||
|
assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
// A NUL immediately before the final line terminator: conversion yields two
// consecutive line terminators, and the binary offset points at the NUL's
// position (len - 2).
#[test]
fn buffer_binary_convert4() {
    let bytes = "homer\nlisa\nmaggie\x00\n";
    let mut linebuf = LineBufferBuilder::new()
        .binary_detection(BinaryDetection::Convert(b'\x00'))
        .build();
    let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);

    // Nothing has been read yet.
    assert!(rdr.buffer().is_empty());

    assert!(rdr.fill().unwrap());
    // The NUL before the final '\n' was converted to '\n'.
    assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n");
    rdr.consume_all();

    assert!(!rdr.fill().unwrap());
    assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
    assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2));
}
|
||||||
|
}
|
||||||
462
grep-searcher/src/lines.rs
Normal file
462
grep-searcher/src/lines.rs
Normal file
@@ -0,0 +1,462 @@
|
|||||||
|
/*!
|
||||||
|
A collection of routines for performing operations on lines.
|
||||||
|
*/
|
||||||
|
|
||||||
|
use bytecount;
|
||||||
|
use memchr::{memchr, memrchr};
|
||||||
|
use grep_matcher::{LineTerminator, Match};
|
||||||
|
|
||||||
|
/// An iterator over lines in a particular slice of bytes.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
///
/// `'b` refers to the lifetime of the underlying bytes.
#[derive(Debug)]
pub struct LineIter<'b> {
    // The full slice being iterated over; the stepper yields ranges into it.
    bytes: &'b [u8],
    // Tracks the current position within `bytes`.
    stepper: LineStep,
}
|
||||||
|
|
||||||
|
impl<'b> LineIter<'b> {
|
||||||
|
/// Create a new line iterator that yields lines in the given bytes that
|
||||||
|
/// are terminated by `line_term`.
|
||||||
|
pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> {
|
||||||
|
LineIter {
|
||||||
|
bytes: bytes,
|
||||||
|
stepper: LineStep::new(line_term, 0, bytes.len()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'b> Iterator for LineIter<'b> {
    type Item = &'b [u8];

    fn next(&mut self) -> Option<&'b [u8]> {
        // Delegate to the stepper and turn the returned range into a
        // subslice of the underlying bytes.
        self.stepper.next_match(self.bytes).map(|m| &self.bytes[m])
    }
}
|
||||||
|
|
||||||
|
/// An explicit iterator over lines in a particular slice of bytes.
///
/// This iterator avoids borrowing the bytes themselves, and instead requires
/// callers to explicitly provide the bytes when moving through the iterator.
/// While not idiomatic, this provides a simple way of iterating over lines
/// that doesn't require borrowing the slice itself, which can be convenient.
///
/// Line terminators are considered part of the line they terminate. All lines
/// yielded by the iterator are guaranteed to be non-empty.
#[derive(Debug)]
pub struct LineStep {
    // The byte that terminates a line, e.g., b'\n'.
    line_term: u8,
    // The current offset; advanced past each line as it is yielded.
    pos: usize,
    // Exclusive upper bound of the region being iterated over.
    end: usize,
}
|
||||||
|
|
||||||
|
impl LineStep {
|
||||||
|
/// Create a new line iterator over the given range of bytes using the
|
||||||
|
/// given line terminator.
|
||||||
|
///
|
||||||
|
/// Callers should provide the actual bytes for each call to `next`. The
|
||||||
|
/// same slice must be provided to each call.
|
||||||
|
///
|
||||||
|
/// This panics if `start` is not less than or equal to `end`.
|
||||||
|
pub fn new(line_term: u8, start: usize, end: usize) -> LineStep {
|
||||||
|
LineStep { line_term, pos: start, end: end }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the start and end position of the next line in the given bytes.
|
||||||
|
///
|
||||||
|
/// The caller must past exactly the same slice of bytes for each call to
|
||||||
|
/// `next`.
|
||||||
|
///
|
||||||
|
/// The range returned includes the line terminator. Ranges are always
|
||||||
|
/// non-empty.
|
||||||
|
pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> {
|
||||||
|
self.next_impl(bytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Like next, but returns a `Match` instead of a tuple.
|
||||||
|
#[inline(always)]
|
||||||
|
pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option<Match> {
|
||||||
|
self.next_impl(bytes).map(|(s, e)| Match::new(s, e))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> {
|
||||||
|
bytes = &bytes[..self.end];
|
||||||
|
match memchr(self.line_term, &bytes[self.pos..]) {
|
||||||
|
None => {
|
||||||
|
if self.pos < bytes.len() {
|
||||||
|
let m = (self.pos, bytes.len());
|
||||||
|
assert!(m.0 <= m.1);
|
||||||
|
|
||||||
|
self.pos = m.1;
|
||||||
|
Some(m)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(line_end) => {
|
||||||
|
let m = (self.pos, self.pos + line_end + 1);
|
||||||
|
assert!(m.0 <= m.1);
|
||||||
|
|
||||||
|
self.pos = m.1;
|
||||||
|
Some(m)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Count the number of occurrences of `line_term` in `bytes`.
///
/// Delegates to the SIMD-accelerated `bytecount` crate.
pub fn count(bytes: &[u8], line_term: u8) -> u64 {
    bytecount::count(bytes, line_term) as u64
}
|
||||||
|
|
||||||
|
/// Given a line that possibly ends with a terminator, return that line without
|
||||||
|
/// the terminator.
|
||||||
|
#[inline(always)]
|
||||||
|
pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] {
|
||||||
|
let line_term = line_term.as_bytes();
|
||||||
|
let start = bytes.len().saturating_sub(line_term.len());
|
||||||
|
if bytes.get(start..) == Some(line_term) {
|
||||||
|
return &bytes[..bytes.len() - line_term.len()];
|
||||||
|
}
|
||||||
|
bytes
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the start and end offsets of the lines containing the given range
/// of bytes.
///
/// Line terminators are considered part of the line they terminate.
#[inline(always)]
pub fn locate(
    bytes: &[u8],
    line_term: u8,
    range: Match,
) -> Match {
    // The line starts just after the closest terminator preceding the range,
    // or at the beginning of `bytes` if there is none.
    let line_start = memrchr(line_term, &bytes[0..range.start()])
        .map_or(0, |i| i + 1);
    let line_end =
        // If the range already ends just past a terminator, that terminator
        // closes the line. Otherwise, extend forward to the next terminator
        // (or to the end of `bytes` if the last line is unterminated).
        if range.end() > line_start && bytes[range.end() - 1] == line_term {
            range.end()
        } else {
            memchr(line_term, &bytes[range.end()..])
                .map_or(bytes.len(), |i| range.end() + i + 1)
        };
    Match::new(line_start, line_end)
}
|
||||||
|
|
||||||
|
/// Returns the minimal starting offset of the line that occurs `count` lines
/// before the last line in `bytes`.
///
/// Lines are terminated by `line_term`. If `count` is zero, then this returns
/// the starting offset of the last line in `bytes`.
///
/// If `bytes` ends with a line terminator, then the terminator itself is
/// considered part of the last line.
pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize {
    // Equivalent to asking for lines preceding the very end of `bytes`.
    preceding_by_pos(bytes, bytes.len(), line_term, count)
}
|
||||||
|
|
||||||
|
/// Returns the minimal starting offset of the line that occurs `count` lines
/// before the line containing `pos`. Lines are terminated by `line_term`.
/// If `count` is zero, then this returns the starting offset of the line
/// containing `pos`.
///
/// If `pos` points just past a line terminator, then it is considered part of
/// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"`
/// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos
/// = 8`) and `preceding(bytes, pos, `b'\n', 1)` returns `0`.
fn preceding_by_pos(
    bytes: &[u8],
    mut pos: usize,
    line_term: u8,
    mut count: usize,
) -> usize {
    if pos == 0 {
        return 0;
    } else if bytes[pos - 1] == line_term {
        // `pos` sits just past a terminator: step over it so that terminator
        // counts as part of the line containing `pos`, not as a boundary.
        pos -= 1;
    }
    // Walk backwards one terminator at a time. Each iteration moves `pos` to
    // the previous terminator until `count` lines have been skipped.
    loop {
        match memrchr(line_term, &bytes[..pos]) {
            None => {
                // No earlier terminator: the first line of `bytes`.
                return 0;
            }
            Some(i) => {
                if count == 0 {
                    // Found the start of the requested line: just past the
                    // terminator at `i`.
                    return i + 1;
                } else if i == 0 {
                    // Terminator at offset 0: nothing precedes it.
                    return 0;
                }
                count -= 1;
                pos = i;
            }
        }
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use std::ops::Range;
    use std::str;

    use grep_matcher::Match;

    use super::*;

    // Six lines; note the final line is intentionally unterminated (the
    // trailing `\` swallows the last newline).
    const SHERLOCK: &'static str = "\
For the Doctor Watsons of this world, as opposed to the Sherlock
Holmeses, success in the province of detective work must always
be, to a very large extent, the result of luck. Sherlock Holmes
can extract a clew from a wisp of straw or a flake of cigar ash;
but Doctor Watson has to have it taken out for him and dusted,
and exhibited clearly, with a label attached.\
";

    // Shorthand for constructing a `Match`.
    fn m(start: usize, end: usize) -> Match {
        Match::new(start, end)
    }

    // Collect all lines (terminators included) as string slices.
    fn lines(text: &str) -> Vec<&str> {
        let mut results = vec![];
        let mut it = LineStep::new(b'\n', 0, text.len());
        while let Some(m) = it.next_match(text.as_bytes()) {
            results.push(&text[m]);
        }
        results
    }

    // Collect all lines as byte ranges.
    fn line_ranges(text: &str) -> Vec<Range<usize>> {
        let mut results = vec![];
        let mut it = LineStep::new(b'\n', 0, text.len());
        while let Some(m) = it.next_match(text.as_bytes()) {
            results.push(m.start()..m.end());
        }
        results
    }

    // Shorthand for `preceding_by_pos` over a `&str`.
    fn prev(text: &str, pos: usize, count: usize) -> usize {
        preceding_by_pos(text.as_bytes(), pos, b'\n', count)
    }

    // Shorthand for `locate` over a `&str`.
    fn loc(text: &str, start: usize, end: usize) -> Match {
        locate(text.as_bytes(), b'\n', Match::new(start, end))
    }

    // `count` counts terminators, so an unterminated final line is not
    // counted.
    #[test]
    fn line_count() {
        assert_eq!(0, count(b"", b'\n'));
        assert_eq!(1, count(b"\n", b'\n'));
        assert_eq!(2, count(b"\n\n", b'\n'));
        assert_eq!(2, count(b"a\nb\nc", b'\n'));
    }

    #[test]
    fn line_locate() {
        let t = SHERLOCK;
        let lines = line_ranges(t);

        assert_eq!(
            loc(t, lines[0].start, lines[0].end),
            m(lines[0].start, lines[0].end));
        assert_eq!(
            loc(t, lines[0].start + 1, lines[0].end),
            m(lines[0].start, lines[0].end));
        assert_eq!(
            loc(t, lines[0].end - 1, lines[0].end),
            m(lines[0].start, lines[0].end));
        // A range starting at a line's end belongs to the following line.
        assert_eq!(
            loc(t, lines[0].end, lines[0].end),
            m(lines[1].start, lines[1].end));

        assert_eq!(
            loc(t, lines[5].start, lines[5].end),
            m(lines[5].start, lines[5].end));
        assert_eq!(
            loc(t, lines[5].start + 1, lines[5].end),
            m(lines[5].start, lines[5].end));
        assert_eq!(
            loc(t, lines[5].end - 1, lines[5].end),
            m(lines[5].start, lines[5].end));
        // The last line is unterminated, so its end stays on the same line.
        assert_eq!(
            loc(t, lines[5].end, lines[5].end),
            m(lines[5].start, lines[5].end));
    }

    // Degenerate inputs: empty slices, bare terminators, ranges at the very
    // edges of the input.
    #[test]
    fn line_locate_weird() {
        assert_eq!(loc("", 0, 0), m(0, 0));

        assert_eq!(loc("\n", 0, 1), m(0, 1));
        assert_eq!(loc("\n", 1, 1), m(1, 1));

        assert_eq!(loc("\n\n", 0, 0), m(0, 1));
        assert_eq!(loc("\n\n", 0, 1), m(0, 1));
        assert_eq!(loc("\n\n", 1, 1), m(1, 2));
        assert_eq!(loc("\n\n", 1, 2), m(1, 2));
        assert_eq!(loc("\n\n", 2, 2), m(2, 2));

        assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2));
        assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2));
        assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4));
        assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4));
        assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5));
        assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5));
    }

    // Every yielded line is non-empty and includes its terminator.
    #[test]
    fn line_iter() {
        assert_eq!(lines("abc"), vec!["abc"]);

        assert_eq!(lines("abc\n"), vec!["abc\n"]);
        assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]);
        assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);

        assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]);
        assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]);
        assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]);
        assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]);
        assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]);

        assert_eq!(lines("\n"), vec!["\n"]);
        assert_eq!(lines(""), Vec::<&str>::new());
    }

    // A zero-length region yields nothing, regardless of the bytes passed.
    #[test]
    fn line_iter_empty() {
        let mut it = LineStep::new(b'\n', 0, 0);
        assert_eq!(it.next(b"abc"), None);
    }

    #[test]
    fn preceding_lines_doc() {
        // These are the examples mentions in the documentation of `preceding`.
        let bytes = b"abc\nxyz\n";
        assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0));
        assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0));
        assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1));
        assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1));
    }

    #[test]
    fn preceding_lines_sherlock() {
        let t = SHERLOCK;
        let lines = line_ranges(t);

        // The following tests check the count == 0 case, i.e., finding the
        // beginning of the line containing the given position.
        assert_eq!(0, prev(t, 0, 0));
        assert_eq!(0, prev(t, 1, 0));
        // The line terminator is addressed by `end-1` and terminates the line
        // it is part of.
        assert_eq!(0, prev(t, lines[0].end - 1, 0));
        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
        // The end position of line addresses the byte immediately following a
        // line terminator, which puts it on the following line.
        assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0));

        // Now tests for count > 0.
        assert_eq!(0, prev(t, 0, 1));
        assert_eq!(0, prev(t, 0, 2));
        assert_eq!(0, prev(t, 1, 1));
        assert_eq!(0, prev(t, 1, 2));
        assert_eq!(0, prev(t, lines[0].end - 1, 1));
        assert_eq!(0, prev(t, lines[0].end - 1, 2));
        assert_eq!(0, prev(t, lines[0].end, 1));
        assert_eq!(0, prev(t, lines[0].end, 2));
        assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
        assert_eq!(lines[3].start, prev(t, lines[4].end, 1));
        assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1));

        // The last line has no line terminator.
        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
        assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
    }

    // Exhaustive backwards walks over a small, fully terminated input.
    #[test]
    fn preceding_lines_short() {
        let t = "a\nb\nc\nd\ne\nf\n";
        let lines = line_ranges(t);
        assert_eq!(12, t.len());

        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
        // Asking for more lines than exist clamps to the start.
        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));

        assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0));
        assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1));
        assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2));
        assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3));
        assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4));
        assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5));
        assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6));

        assert_eq!(lines[4].start, prev(t, lines[5].start, 0));
        assert_eq!(lines[3].start, prev(t, lines[5].start, 1));
        assert_eq!(lines[2].start, prev(t, lines[5].start, 2));
        assert_eq!(lines[1].start, prev(t, lines[5].start, 3));
        assert_eq!(lines[0].start, prev(t, lines[5].start, 4));
        assert_eq!(lines[0].start, prev(t, lines[5].start, 5));

        assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1));
        assert_eq!(lines[2].start, prev(t, lines[4].start, 1));

        assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1));
        assert_eq!(lines[1].start, prev(t, lines[3].start, 1));

        assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1));
        assert_eq!(lines[0].start, prev(t, lines[2].start, 1));

        assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1));
        assert_eq!(lines[0].start, prev(t, lines[1].start, 1));

        assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1));
        assert_eq!(lines[0].start, prev(t, lines[0].start, 1));
    }

    // Empty lines at the beginning of the input.
    #[test]
    fn preceding_lines_empty1() {
        let t = "\n\n\nd\ne\nf\n";
        let lines = line_ranges(t);
        assert_eq!(9, t.len());

        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
        assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
        assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
        assert_eq!(lines[0].start, prev(t, lines[1].end, 1));

        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
    }

    // Empty lines after a non-empty first line.
    #[test]
    fn preceding_lines_empty2() {
        let t = "a\n\n\nd\ne\nf\n";
        let lines = line_ranges(t);
        assert_eq!(10, t.len());

        assert_eq!(lines[0].start, prev(t, lines[0].end, 0));
        assert_eq!(lines[0].start, prev(t, lines[0].end, 1));
        assert_eq!(lines[1].start, prev(t, lines[1].end, 0));
        assert_eq!(lines[0].start, prev(t, lines[1].end, 1));

        assert_eq!(lines[5].start, prev(t, lines[5].end, 0));
        assert_eq!(lines[4].start, prev(t, lines[5].end, 1));
        assert_eq!(lines[3].start, prev(t, lines[5].end, 2));
        assert_eq!(lines[2].start, prev(t, lines[5].end, 3));
        assert_eq!(lines[1].start, prev(t, lines[5].end, 4));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 5));
        assert_eq!(lines[0].start, prev(t, lines[5].end, 6));
    }
}
|
||||||
25
grep-searcher/src/macros.rs
Normal file
25
grep-searcher/src/macros.rs
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
/// Like assert_eq, but nicer output for long strings.
///
/// Compares `$expected` and `$got` after deref-coercion (so `String` and
/// `&str` mix freely) and, on mismatch, panics with both values printed in
/// full between rulers, tagged with a `format!`-style label built from the
/// trailing arguments.
#[cfg(test)]
#[macro_export]
macro_rules! assert_eq_printed {
    ($expected:expr, $got:expr, $($tt:tt)*) => {
        let expected = &*$expected;
        let got = &*$got;
        let label = format!($($tt)*);
        if expected != got {
            panic!("
printed outputs differ! (label: {})

expected:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

got:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
", label, expected, got);
        }
    }
}
|
||||||
583
grep-searcher/src/searcher/core.rs
Normal file
583
grep-searcher/src/searcher/core.rs
Normal file
@@ -0,0 +1,583 @@
|
|||||||
|
use std::cmp;
|
||||||
|
|
||||||
|
use memchr::memchr;
|
||||||
|
|
||||||
|
use grep_matcher::{LineMatchKind, Matcher};
|
||||||
|
use lines::{self, LineStep};
|
||||||
|
use line_buffer::BinaryDetection;
|
||||||
|
use searcher::{Config, Range, Searcher};
|
||||||
|
use sink::{
|
||||||
|
Sink, SinkError,
|
||||||
|
SinkFinish, SinkContext, SinkContextKind, SinkMatch,
|
||||||
|
};
|
||||||
|
|
||||||
|
// The work-horse of a search: tracks positions, line numbers, context and
// binary state while driving matches from a `Matcher` into a `Sink`.
#[derive(Debug)]
pub struct Core<'s, M: 's, S> {
    // The search configuration, borrowed from the owning `Searcher`.
    config: &'s Config,
    // The matcher used to find candidate/confirmed matches.
    matcher: M,
    // The searcher driving this core.
    searcher: &'s Searcher,
    // Where results (matches, context, finish info) are reported.
    sink: S,
    // Whether binary detection is enabled for this search.
    binary: bool,
    // Current offset into the buffer being searched.
    pos: usize,
    // Total bytes consumed across all buffer rolls.
    absolute_byte_offset: u64,
    // Offset of the first binary byte detected, if any.
    binary_byte_offset: Option<usize>,
    // Current line number, if line counting is enabled.
    line_number: Option<u64>,
    // Buffer offset up to which lines have been counted.
    last_line_counted: usize,
    // Buffer offset of the end of the last line sent to the sink.
    last_line_visited: usize,
    // How many more after-context lines still need to be emitted.
    after_context_left: usize,
    // Whether anything has been reported to the sink yet.
    has_sunk: bool,
}
|
||||||
|
|
||||||
|
impl<'s, M: Matcher, S: Sink> Core<'s, M, S> {
|
||||||
|
    /// Build a new core for the given searcher/matcher/sink triple.
    ///
    /// `binary` says whether binary detection should be applied. Line
    /// numbering starts at 1 only when the searcher's config enables it.
    pub fn new(
        searcher: &'s Searcher,
        matcher: M,
        sink: S,
        binary: bool,
    ) -> Core<'s, M, S> {
        let line_number =
            if searcher.config.line_number {
                Some(1)
            } else {
                None
            };
        let core = Core {
            config: &searcher.config,
            matcher: matcher,
            searcher: searcher,
            sink: sink,
            binary: binary,
            pos: 0,
            absolute_byte_offset: 0,
            binary_byte_offset: None,
            line_number: line_number,
            last_line_counted: 0,
            last_line_visited: 0,
            after_context_left: 0,
            has_sunk: false,
        };
        // Purely diagnostic: record which line-oriented strategy will run.
        if !core.searcher.multi_line_with_matcher(&core.matcher) {
            if core.is_line_by_line_fast() {
                trace!("searcher core: will use fast line searcher");
            } else {
                trace!("searcher core: will use slow line searcher");
            }
        }
        core
    }
|
||||||
|
|
||||||
|
    /// Return the current offset into the buffer being searched.
    pub fn pos(&self) -> usize {
        self.pos
    }
|
||||||
|
|
||||||
|
    /// Set the current offset into the buffer being searched.
    pub fn set_pos(&mut self, pos: usize) {
        self.pos = pos;
    }
|
||||||
|
|
||||||
|
    /// Return the absolute offset of the first binary byte detected, if any.
    pub fn binary_byte_offset(&self) -> Option<u64> {
        self.binary_byte_offset.map(|offset| offset as u64)
    }
|
||||||
|
|
||||||
|
    /// Return a reference to the underlying matcher.
    pub fn matcher(&self) -> &M {
        &self.matcher
    }
|
||||||
|
|
||||||
|
    /// Report a matching range to the sink.
    ///
    /// Returns `Ok(false)` if the sink asked the search to stop.
    pub fn matched(
        &mut self,
        buf: &[u8],
        range: &Range,
    ) -> Result<bool, S::Error> {
        self.sink_matched(buf, range)
    }
|
||||||
|
|
||||||
|
    /// Notify the sink that a search is starting.
    ///
    /// Returns `Ok(false)` if the sink declines to run the search.
    pub fn begin(&mut self) -> Result<bool, S::Error> {
        self.sink.begin(&self.searcher)
    }
|
||||||
|
|
||||||
|
    /// Notify the sink that the search has completed, reporting the total
    /// byte count and the binary offset (if binary data was detected).
    pub fn finish(
        &mut self,
        byte_count: u64,
        binary_byte_offset: Option<u64>,
    ) -> Result<(), S::Error> {
        self.sink.finish(
            &self.searcher,
            &SinkFinish {
                byte_count,
                binary_byte_offset,
            })
    }
|
||||||
|
|
||||||
|
    /// Search the buffer line-by-line, dispatching to the fast path (the
    /// matcher can find candidate lines directly) or the slow path
    /// (match each line individually).
    pub fn match_by_line(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
        if self.is_line_by_line_fast() {
            self.match_by_line_fast(buf)
        } else {
            self.match_by_line_slow(buf)
        }
    }
|
||||||
|
|
||||||
|
    /// Account for the processed prefix of `buf` before the buffer is
    /// refilled, and return how many bytes may be discarded.
    ///
    /// When context is requested, the tail needed for before-context (and
    /// the position of the previously visited line) is retained.
    pub fn roll(&mut self, buf: &[u8]) -> usize {
        let consumed =
            if self.config.max_context() == 0 {
                // No context to keep: the entire buffer can be discarded.
                buf.len()
            } else {
                // It might seem like all we need to care about here is just
                // the "before context," but in order to sink the context
                // separator (when before_context==0 and after_context>0), we
                // need to know something about the position of the previous
                // line visited, even if we're at the beginning of the buffer.
                let context_start = lines::preceding(
                    buf,
                    self.config.line_term.as_byte(),
                    self.config.max_context(),
                );
                let consumed = cmp::max(context_start, self.last_line_visited);
                consumed
            };
        // Update line counts and absolute position for the discarded bytes,
        // then rebase all buffer-relative offsets to the new buffer start.
        self.count_lines(buf, consumed);
        self.absolute_byte_offset += consumed as u64;
        self.last_line_counted = 0;
        self.last_line_visited = 0;
        self.set_pos(buf.len() - consumed);
        consumed
    }
|
||||||
|
|
||||||
|
    /// Return true if binary data has been detected (now or previously).
    ///
    /// Only `BinaryDetection::Quit` triggers detection here; on the first
    /// occurrence of the configured byte within `range`, its offset is
    /// recorded in `binary_byte_offset`.
    pub fn detect_binary(&mut self, buf: &[u8], range: &Range) -> bool {
        if self.binary_byte_offset.is_some() {
            // Already detected earlier in this search.
            return true;
        }
        let binary_byte = match self.config.binary.0 {
            BinaryDetection::Quit(b) => b,
            _ => return false,
        };
        if let Some(i) = memchr(binary_byte, &buf[*range]) {
            self.binary_byte_offset = Some(range.start() + i);
            true
        } else {
            false
        }
    }
|
||||||
|
|
||||||
|
    /// Emit up to `before_context` lines of context preceding `upto`.
    ///
    /// Lines already visited are never re-emitted; a break/context separator
    /// is sunk first when there is a gap. Returns `Ok(false)` if the sink
    /// asked the search to stop.
    pub fn before_context_by_line(
        &mut self,
        buf: &[u8],
        upto: usize,
    ) -> Result<bool, S::Error> {
        if self.config.before_context == 0 {
            return Ok(true);
        }
        let range = Range::new(self.last_line_visited, upto);
        if range.is_empty() {
            return Ok(true);
        }
        // Find where the context window starts: at most `before_context`
        // lines (the line containing `upto` counts as one, hence `- 1`).
        let before_context_start = range.start() + lines::preceding(
            &buf[range],
            self.config.line_term.as_byte(),
            self.config.before_context - 1,
        );

        let range = Range::new(before_context_start, range.end());
        let mut stepper = LineStep::new(
            self.config.line_term.as_byte(),
            range.start(),
            range.end(),
        );
        while let Some(line) = stepper.next_match(buf) {
            if !self.sink_break_context(line.start())? {
                return Ok(false);
            }
            if !self.sink_before_context(buf, &line)? {
                return Ok(false);
            }
        }
        Ok(true)
    }
|
||||||
|
|
||||||
|
    /// Emit pending after-context lines (if any) between the last visited
    /// line and `upto`, decrementing the remaining budget as lines are sunk.
    ///
    /// Returns `Ok(false)` if the sink asked the search to stop.
    pub fn after_context_by_line(
        &mut self,
        buf: &[u8],
        upto: usize,
    ) -> Result<bool, S::Error> {
        if self.after_context_left == 0 {
            return Ok(true);
        }
        let range = Range::new(self.last_line_visited, upto);
        let mut stepper = LineStep::new(
            self.config.line_term.as_byte(),
            range.start(),
            range.end(),
        );
        while let Some(line) = stepper.next_match(buf) {
            if !self.sink_after_context(buf, &line)? {
                return Ok(false);
            }
            // Stop once the after-context budget is exhausted;
            // `sink_after_context` is responsible for decrementing it.
            if self.after_context_left == 0 {
                break;
            }
        }
        Ok(true)
    }
|
||||||
|
|
||||||
|
pub fn other_context_by_line(
|
||||||
|
&mut self,
|
||||||
|
buf: &[u8],
|
||||||
|
upto: usize,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
let range = Range::new(self.last_line_visited, upto);
|
||||||
|
let mut stepper = LineStep::new(
|
||||||
|
self.config.line_term.as_byte(),
|
||||||
|
range.start(),
|
||||||
|
range.end(),
|
||||||
|
);
|
||||||
|
while let Some(line) = stepper.next_match(buf) {
|
||||||
|
if !self.sink_other_context(buf, &line)? {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Slow line-oriented search: run the matcher against every line
    /// individually, honoring invert-match, context and passthru settings.
    fn match_by_line_slow(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
        debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));

        let range = Range::new(self.pos(), buf.len());
        let mut stepper = LineStep::new(
            self.config.line_term.as_byte(),
            range.start(),
            range.end(),
        );
        while let Some(line) = stepper.next_match(buf) {
            let matched = {
                // Stripping the line terminator is necessary to prevent some
                // classes of regexes from matching the empty position *after*
                // the end of the line. For example, `(?m)^$` will match at
                // position (2, 2) in the string `a\n`.
                let slice = lines::without_terminator(
                    &buf[line],
                    self.config.line_term,
                );
                match self.matcher.shortest_match(slice) {
                    Err(err) => return Err(S::Error::error_message(err)),
                    Ok(result) => result.is_some(),
                }
            };
            self.set_pos(line.end());
            // A line is "reportable" when match-ness differs from the
            // invert flag (XOR semantics).
            if matched != self.config.invert_match {
                if !self.before_context_by_line(buf, line.start())? {
                    return Ok(false);
                }
                if !self.sink_matched(buf, &line)? {
                    return Ok(false);
                }
            } else if self.after_context_left >= 1 {
                // Non-reportable line, but after-context is still owed.
                if !self.sink_after_context(buf, &line)? {
                    return Ok(false);
                }
            } else if self.config.passthru {
                // Passthru prints every line; non-matches go out as context.
                if !self.sink_other_context(buf, &line)? {
                    return Ok(false);
                }
            }
        }
        Ok(true)
    }
|
||||||
|
|
||||||
|
    /// Fast line-oriented search: let the matcher locate candidate lines
    /// directly instead of testing every line.
    fn match_by_line_fast(&mut self, buf: &[u8]) -> Result<bool, S::Error> {
        // Passthru forces the slow path, so it must not be set here.
        debug_assert!(!self.config.passthru);

        while !buf[self.pos()..].is_empty() {
            if self.config.invert_match {
                if !self.match_by_line_fast_invert(buf)? {
                    return Ok(false);
                }
            } else if let Some(line) = self.find_by_line_fast(buf)? {
                if self.config.max_context() > 0 {
                    // Flush pending after-context before emitting the
                    // before-context for this match.
                    if !self.after_context_by_line(buf, line.start())? {
                        return Ok(false);
                    }
                    if !self.before_context_by_line(buf, line.start())? {
                        return Ok(false);
                    }
                }
                self.set_pos(line.end());
                if !self.sink_matched(buf, &line)? {
                    return Ok(false);
                }
            } else {
                // No more matches in this buffer.
                break;
            }
        }
        // Emit any after-context owed from the final match in this buffer.
        if !self.after_context_by_line(buf, buf.len())? {
            return Ok(false);
        }
        self.set_pos(buf.len());
        Ok(true)
    }
|
||||||
|
|
||||||
|
    /// Fast-path handling for inverted matches: everything between the
    /// current position and the next matching line (or the end of the
    /// buffer) is the inverted-match region, each line of which is sunk as
    /// a match.
    #[inline(always)]
    fn match_by_line_fast_invert(
        &mut self,
        buf: &[u8],
    ) -> Result<bool, S::Error> {
        assert!(self.config.invert_match);

        let invert_match = match self.find_by_line_fast(buf)? {
            None => {
                // No match ahead: the rest of the buffer is inverted.
                let range = Range::new(self.pos(), buf.len());
                self.set_pos(range.end());
                range
            }
            Some(line) => {
                // The inverted region ends where the matching line begins;
                // resume searching after that line.
                let range = Range::new(self.pos(), line.start());
                self.set_pos(line.end());
                range
            }
        };
        if invert_match.is_empty() {
            return Ok(true);
        }
        // Flush after-context owed from earlier output, then the
        // before-context leading into this inverted region.
        if !self.after_context_by_line(buf, invert_match.start())? {
            return Ok(false);
        }
        if !self.before_context_by_line(buf, invert_match.start())? {
            return Ok(false);
        }
        let mut stepper = LineStep::new(
            self.config.line_term.as_byte(),
            invert_match.start(),
            invert_match.end(),
        );
        while let Some(line) = stepper.next_match(buf) {
            if !self.sink_matched(buf, &line)? {
                return Ok(false);
            }
        }
        Ok(true)
    }
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn find_by_line_fast(
|
||||||
|
&self,
|
||||||
|
buf: &[u8],
|
||||||
|
) -> Result<Option<Range>, S::Error> {
|
||||||
|
debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));
|
||||||
|
debug_assert!(self.is_line_by_line_fast());
|
||||||
|
|
||||||
|
let mut pos = self.pos();
|
||||||
|
while !buf[pos..].is_empty() {
|
||||||
|
match self.matcher.find_candidate_line(&buf[pos..]) {
|
||||||
|
Err(err) => return Err(S::Error::error_message(err)),
|
||||||
|
Ok(None) => return Ok(None),
|
||||||
|
Ok(Some(LineMatchKind::Confirmed(i))) => {
|
||||||
|
let line = lines::locate(
|
||||||
|
buf,
|
||||||
|
self.config.line_term.as_byte(),
|
||||||
|
Range::zero(i).offset(pos),
|
||||||
|
);
|
||||||
|
// If we matched beyond the end of the buffer, then we
|
||||||
|
// don't report this as a match.
|
||||||
|
if line.start() == buf.len() {
|
||||||
|
pos = buf.len();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
return Ok(Some(line));
|
||||||
|
}
|
||||||
|
Ok(Some(LineMatchKind::Candidate(i))) => {
|
||||||
|
let line = lines::locate(
|
||||||
|
buf,
|
||||||
|
self.config.line_term.as_byte(),
|
||||||
|
Range::zero(i).offset(pos),
|
||||||
|
);
|
||||||
|
// We need to strip the line terminator here to match the
|
||||||
|
// semantics of line-by-line searching. Namely, regexes
|
||||||
|
// like `(?m)^$` can match at the final position beyond a
|
||||||
|
// line terminator, which is non-sensical in line oriented
|
||||||
|
// matching.
|
||||||
|
let slice = lines::without_terminator(
|
||||||
|
&buf[line],
|
||||||
|
self.config.line_term,
|
||||||
|
);
|
||||||
|
match self.matcher.is_match(slice) {
|
||||||
|
Err(err) => return Err(S::Error::error_message(err)),
|
||||||
|
Ok(true) => return Ok(Some(line)),
|
||||||
|
Ok(false) => {
|
||||||
|
pos = line.end();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn sink_matched(
|
||||||
|
&mut self,
|
||||||
|
buf: &[u8],
|
||||||
|
range: &Range,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
if self.binary && self.detect_binary(buf, range) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
if !self.sink_break_context(range.start())? {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.count_lines(buf, range.start());
|
||||||
|
let offset = self.absolute_byte_offset + range.start() as u64;
|
||||||
|
let linebuf = &buf[*range];
|
||||||
|
let keepgoing = self.sink.matched(
|
||||||
|
&self.searcher,
|
||||||
|
&SinkMatch {
|
||||||
|
line_term: self.config.line_term,
|
||||||
|
bytes: linebuf,
|
||||||
|
absolute_byte_offset: offset,
|
||||||
|
line_number: self.line_number,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
if !keepgoing {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.last_line_visited = range.end();
|
||||||
|
self.after_context_left = self.config.after_context;
|
||||||
|
self.has_sunk = true;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sink_before_context(
|
||||||
|
&mut self,
|
||||||
|
buf: &[u8],
|
||||||
|
range: &Range,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
if self.binary && self.detect_binary(buf, range) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.count_lines(buf, range.start());
|
||||||
|
let offset = self.absolute_byte_offset + range.start() as u64;
|
||||||
|
let keepgoing = self.sink.context(
|
||||||
|
&self.searcher,
|
||||||
|
&SinkContext {
|
||||||
|
line_term: self.config.line_term,
|
||||||
|
bytes: &buf[*range],
|
||||||
|
kind: SinkContextKind::Before,
|
||||||
|
absolute_byte_offset: offset,
|
||||||
|
line_number: self.line_number,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
if !keepgoing {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.last_line_visited = range.end();
|
||||||
|
self.has_sunk = true;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sink_after_context(
|
||||||
|
&mut self,
|
||||||
|
buf: &[u8],
|
||||||
|
range: &Range,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
assert!(self.after_context_left >= 1);
|
||||||
|
|
||||||
|
if self.binary && self.detect_binary(buf, range) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.count_lines(buf, range.start());
|
||||||
|
let offset = self.absolute_byte_offset + range.start() as u64;
|
||||||
|
let keepgoing = self.sink.context(
|
||||||
|
&self.searcher,
|
||||||
|
&SinkContext {
|
||||||
|
line_term: self.config.line_term,
|
||||||
|
bytes: &buf[*range],
|
||||||
|
kind: SinkContextKind::After,
|
||||||
|
absolute_byte_offset: offset,
|
||||||
|
line_number: self.line_number,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
if !keepgoing {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.last_line_visited = range.end();
|
||||||
|
self.after_context_left -= 1;
|
||||||
|
self.has_sunk = true;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sink_other_context(
|
||||||
|
&mut self,
|
||||||
|
buf: &[u8],
|
||||||
|
range: &Range,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
if self.binary && self.detect_binary(buf, range) {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.count_lines(buf, range.start());
|
||||||
|
let offset = self.absolute_byte_offset + range.start() as u64;
|
||||||
|
let keepgoing = self.sink.context(
|
||||||
|
&self.searcher,
|
||||||
|
&SinkContext {
|
||||||
|
line_term: self.config.line_term,
|
||||||
|
bytes: &buf[*range],
|
||||||
|
kind: SinkContextKind::Other,
|
||||||
|
absolute_byte_offset: offset,
|
||||||
|
line_number: self.line_number,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
if !keepgoing {
|
||||||
|
return Ok(false);
|
||||||
|
}
|
||||||
|
self.last_line_visited = range.end();
|
||||||
|
self.has_sunk = true;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn sink_break_context(
|
||||||
|
&mut self,
|
||||||
|
start_of_line: usize,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
let is_gap = self.last_line_visited < start_of_line;
|
||||||
|
let any_context =
|
||||||
|
self.config.before_context > 0
|
||||||
|
|| self.config.after_context > 0;
|
||||||
|
|
||||||
|
if !any_context || !self.has_sunk || !is_gap {
|
||||||
|
Ok(true)
|
||||||
|
} else {
|
||||||
|
self.sink.context_break(&self.searcher)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn count_lines(&mut self, buf: &[u8], upto: usize) {
|
||||||
|
if let Some(ref mut line_number) = self.line_number {
|
||||||
|
if self.last_line_counted >= upto {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
let slice = &buf[self.last_line_counted..upto];
|
||||||
|
let count = lines::count(slice, self.config.line_term.as_byte());
|
||||||
|
*line_number += count;
|
||||||
|
self.last_line_counted = upto;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_line_by_line_fast(&self) -> bool {
|
||||||
|
debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher));
|
||||||
|
|
||||||
|
if self.config.passthru {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if let Some(line_term) = self.matcher.line_terminator() {
|
||||||
|
if line_term == self.config.line_term {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(non_matching) = self.matcher.non_matching_bytes() {
|
||||||
|
// If the line terminator is CRLF, we don't actually need to care
|
||||||
|
// whether the regex can match `\r` or not. Namely, a `\r` is
|
||||||
|
// neither necessary nor sufficient to terminate a line. A `\n` is
|
||||||
|
// always required.
|
||||||
|
if non_matching.contains(self.config.line_term.as_byte()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
1506
grep-searcher/src/searcher/glue.rs
Normal file
1506
grep-searcher/src/searcher/glue.rs
Normal file
File diff suppressed because it is too large
Load Diff
106
grep-searcher/src/searcher/mmap.rs
Normal file
106
grep-searcher/src/searcher/mmap.rs
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
use std::fs::File;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use memmap::Mmap;
|
||||||
|
|
||||||
|
/// Controls the strategy used for determining when to use memory maps.
|
||||||
|
///
|
||||||
|
/// If a searcher is called in circumstances where it is possible to use memory
|
||||||
|
/// maps, and memory maps are enabled, then it will attempt to do so if it
|
||||||
|
/// believes it will make the search faster.
|
||||||
|
///
|
||||||
|
/// By default, memory maps are disabled.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct MmapChoice(MmapChoiceImpl);
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
enum MmapChoiceImpl {
|
||||||
|
Auto,
|
||||||
|
Never,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for MmapChoice {
|
||||||
|
fn default() -> MmapChoice {
|
||||||
|
MmapChoice(MmapChoiceImpl::Never)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MmapChoice {
|
||||||
|
/// Use memory maps when they are believed to be advantageous.
|
||||||
|
///
|
||||||
|
/// The heuristics used to determine whether to use a memory map or not
|
||||||
|
/// may depend on many things, including but not limited to, file size
|
||||||
|
/// and platform.
|
||||||
|
///
|
||||||
|
/// If memory maps are unavailable or cannot be used for a specific input,
|
||||||
|
/// then normal OS read calls are used instead.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
///
|
||||||
|
/// This constructor is not safe because there is no obvious way to
|
||||||
|
/// encapsulate the safety of file backed memory maps on all platforms
|
||||||
|
/// without simultaneously negating some or all of their benefits.
|
||||||
|
///
|
||||||
|
/// The specific contract the caller is required to uphold isn't precise,
|
||||||
|
/// but it basically amounts to something like, "the caller guarantees that
|
||||||
|
/// the underlying file won't be mutated." This, of course, isn't feasible
|
||||||
|
/// in many environments. However, command line tools may still decide to
|
||||||
|
/// take the risk of, say, a `SIGBUS` occurring while attempting to read a
|
||||||
|
/// memory map.
|
||||||
|
pub unsafe fn auto() -> MmapChoice {
|
||||||
|
MmapChoice(MmapChoiceImpl::Auto)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Never use memory maps, no matter what. This is the default.
|
||||||
|
pub fn never() -> MmapChoice {
|
||||||
|
MmapChoice(MmapChoiceImpl::Never)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return a memory map if memory maps are enabled and if creating a
|
||||||
|
/// memory from the given file succeeded and if memory maps are believed
|
||||||
|
/// to be advantageous for performance.
|
||||||
|
///
|
||||||
|
/// If this does attempt to open a memory map and it fails, then `None`
|
||||||
|
/// is returned and the corresponding error (along with the file path, if
|
||||||
|
/// present) is logged at the debug level.
|
||||||
|
pub(crate) fn open(
|
||||||
|
&self,
|
||||||
|
file: &File,
|
||||||
|
path: Option<&Path>,
|
||||||
|
) -> Option<Mmap> {
|
||||||
|
if !self.is_enabled() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if cfg!(target_os = "macos") {
|
||||||
|
// I guess memory maps on macOS aren't great. Should re-evaluate.
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// SAFETY: This is acceptable because the only way `MmapChoiceImpl` can
|
||||||
|
// be `Auto` is if the caller invoked the `auto` constructor, which
|
||||||
|
// is itself not safe. Thus, this is a propagation of the caller's
|
||||||
|
// assertion that using memory maps is safe.
|
||||||
|
match unsafe { Mmap::map(file) } {
|
||||||
|
Ok(mmap) => Some(mmap),
|
||||||
|
Err(err) => {
|
||||||
|
if let Some(path) = path {
|
||||||
|
debug!(
|
||||||
|
"{}: failed to open memory map: {}",
|
||||||
|
path.display(),
|
||||||
|
err
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
debug!("failed to open memory map: {}", err);
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether this strategy may employ memory maps or not.
|
||||||
|
pub(crate) fn is_enabled(&self) -> bool {
|
||||||
|
match self.0 {
|
||||||
|
MmapChoiceImpl::Auto => true,
|
||||||
|
MmapChoiceImpl::Never => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
957
grep-searcher/src/searcher/mod.rs
Normal file
957
grep-searcher/src/searcher/mod.rs
Normal file
@@ -0,0 +1,957 @@
|
|||||||
|
use std::cell::RefCell;
|
||||||
|
use std::cmp;
|
||||||
|
use std::fmt;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, Read};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use encoding_rs;
|
||||||
|
use encoding_rs_io::DecodeReaderBytesBuilder;
|
||||||
|
use grep_matcher::{LineTerminator, Match, Matcher};
|
||||||
|
use line_buffer::{
|
||||||
|
self, BufferAllocation, LineBuffer, LineBufferBuilder, LineBufferReader,
|
||||||
|
DEFAULT_BUFFER_CAPACITY, alloc_error,
|
||||||
|
};
|
||||||
|
use searcher::glue::{ReadByLine, SliceByLine, MultiLine};
|
||||||
|
use sink::{Sink, SinkError};
|
||||||
|
|
||||||
|
pub use self::mmap::MmapChoice;
|
||||||
|
|
||||||
|
mod core;
|
||||||
|
mod glue;
|
||||||
|
mod mmap;
|
||||||
|
|
||||||
|
/// We use this type alias since we want the ergonomics of a matcher's `Match`
|
||||||
|
/// type, but in practice, we use it for arbitrary ranges, so give it a more
|
||||||
|
/// accurate name. This is only used in the searcher's internals.
|
||||||
|
type Range = Match;
|
||||||
|
|
||||||
|
/// The behavior of binary detection while searching.
|
||||||
|
///
|
||||||
|
/// Binary detection is the process of _heuristically_ identifying whether a
|
||||||
|
/// given chunk of data is binary or not, and then taking an action based on
|
||||||
|
/// the result of that heuristic. The motivation behind detecting binary data
|
||||||
|
/// is that binary data often indicates data that is undesirable to search
|
||||||
|
/// using textual patterns. Of course, there are many cases in which this isn't
|
||||||
|
/// true, which is why binary detection is disabled by default.
|
||||||
|
///
|
||||||
|
/// Unfortunately, binary detection works differently depending on the type of
|
||||||
|
/// search being executed:
|
||||||
|
///
|
||||||
|
/// 1. When performing a search using a fixed size buffer, binary detection is
|
||||||
|
/// applied to the buffer's contents as it is filled. Binary detection must
|
||||||
|
/// be applied to the buffer directly because binary files may not contain
|
||||||
|
/// line terminators, which could result in exorbitant memory usage.
|
||||||
|
/// 2. When performing a search using memory maps or by reading data off the
|
||||||
|
/// heap, then binary detection is only guaranteed to be applied to the
|
||||||
|
/// parts corresponding to a match. When `Quit` is enabled, then the first
|
||||||
|
/// few KB of the data are searched for binary data.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
pub struct BinaryDetection(line_buffer::BinaryDetection);
|
||||||
|
|
||||||
|
impl BinaryDetection {
|
||||||
|
/// No binary detection is performed. Data reported by the searcher may
|
||||||
|
/// contain arbitrary bytes.
|
||||||
|
///
|
||||||
|
/// This is the default.
|
||||||
|
pub fn none() -> BinaryDetection {
|
||||||
|
BinaryDetection(line_buffer::BinaryDetection::None)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Binary detection is performed by looking for the given byte.
|
||||||
|
///
|
||||||
|
/// When searching is performed using a fixed size buffer, then the
|
||||||
|
/// contents of that buffer are always searched for the presence of this
|
||||||
|
/// byte. If it is found, then the underlying data is considered binary
|
||||||
|
/// and the search stops as if it reached EOF.
|
||||||
|
///
|
||||||
|
/// When searching is performed with the entire contents mapped into
|
||||||
|
/// memory, then binary detection is more conservative. Namely, only a
|
||||||
|
/// fixed sized region at the beginning of the contents are detected for
|
||||||
|
/// binary data. As a compromise, any subsequent matching (or context)
|
||||||
|
/// lines are also searched for binary data. If binary data is detected at
|
||||||
|
/// any point, then the search stops as if it reached EOF.
|
||||||
|
pub fn quit(binary_byte: u8) -> BinaryDetection {
|
||||||
|
BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte))
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO(burntsushi): Figure out how to make binary conversion work. This
|
||||||
|
// permits implementing GNU grep's default behavior, which is to zap NUL
|
||||||
|
// bytes but still execute a search (if a match is detected, then GNU grep
|
||||||
|
// stops and reports that a match was found but doesn't print the matching
|
||||||
|
// line itself).
|
||||||
|
//
|
||||||
|
// This behavior is pretty simple to implement using the line buffer (and
|
||||||
|
// in fact, it is already implemented and tested), since there's a fixed
|
||||||
|
// size buffer that we can easily write to. The issue arises when searching
|
||||||
|
// a `&[u8]` (whether on the heap or via a memory map), since this isn't
|
||||||
|
// something we can easily write to.
|
||||||
|
|
||||||
|
/// The given byte is searched in all contents read by the line buffer. If
|
||||||
|
/// it occurs, then it is replaced by the line terminator. The line buffer
|
||||||
|
/// guarantees that this byte will never be observable by callers.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
fn convert(binary_byte: u8) -> BinaryDetection {
|
||||||
|
BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An encoding to use when searching.
|
||||||
|
///
|
||||||
|
/// An encoding can be used to configure a
|
||||||
|
/// [`SearcherBuilder`](struct.SearchBuilder.html)
|
||||||
|
/// to transcode source data from an encoding to UTF-8 before searching.
|
||||||
|
///
|
||||||
|
/// An `Encoding` will always be cheap to clone.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Encoding(&'static encoding_rs::Encoding);
|
||||||
|
|
||||||
|
impl Encoding {
|
||||||
|
/// Create a new encoding for the specified label.
|
||||||
|
///
|
||||||
|
/// The encoding label provided is mapped to an encoding via the set of
|
||||||
|
/// available choices specified in the
|
||||||
|
/// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get).
|
||||||
|
/// If the given label does not correspond to a valid encoding, then this
|
||||||
|
/// returns an error.
|
||||||
|
pub fn new(label: &str) -> Result<Encoding, ConfigError> {
|
||||||
|
let label = label.as_bytes();
|
||||||
|
match encoding_rs::Encoding::for_label_no_replacement(label) {
|
||||||
|
Some(encoding) => Ok(Encoding(encoding)),
|
||||||
|
None => {
|
||||||
|
Err(ConfigError::UnknownEncoding { label: label.to_vec() })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The internal configuration of a searcher. This is shared among several
|
||||||
|
/// search related types, but is only ever written to by the SearcherBuilder.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Config {
|
||||||
|
/// The line terminator to use.
|
||||||
|
line_term: LineTerminator,
|
||||||
|
/// Whether to invert matching.
|
||||||
|
invert_match: bool,
|
||||||
|
/// The number of lines after a match to include.
|
||||||
|
after_context: usize,
|
||||||
|
/// The number of lines before a match to include.
|
||||||
|
before_context: usize,
|
||||||
|
/// Whether to enable unbounded context or not.
|
||||||
|
passthru: bool,
|
||||||
|
/// Whether to count line numbers.
|
||||||
|
line_number: bool,
|
||||||
|
/// The maximum amount of heap memory to use.
|
||||||
|
///
|
||||||
|
/// When not given, no explicit limit is enforced. When set to `0`, then
|
||||||
|
/// only the memory map search strategy is available.
|
||||||
|
heap_limit: Option<usize>,
|
||||||
|
/// The memory map strategy.
|
||||||
|
mmap: MmapChoice,
|
||||||
|
/// The binary data detection strategy.
|
||||||
|
binary: BinaryDetection,
|
||||||
|
/// Whether to enable matching across multiple lines.
|
||||||
|
multi_line: bool,
|
||||||
|
/// An encoding that, when present, causes the searcher to transcode all
|
||||||
|
/// input from the encoding to UTF-8.
|
||||||
|
encoding: Option<Encoding>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for Config {
|
||||||
|
fn default() -> Config {
|
||||||
|
Config {
|
||||||
|
line_term: LineTerminator::default(),
|
||||||
|
invert_match: false,
|
||||||
|
after_context: 0,
|
||||||
|
before_context: 0,
|
||||||
|
passthru: false,
|
||||||
|
line_number: true,
|
||||||
|
heap_limit: None,
|
||||||
|
mmap: MmapChoice::default(),
|
||||||
|
binary: BinaryDetection::default(),
|
||||||
|
multi_line: false,
|
||||||
|
encoding: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Config {
|
||||||
|
/// Return the maximal amount of lines needed to fulfill this
|
||||||
|
/// configuration's context.
|
||||||
|
///
|
||||||
|
/// If this returns `0`, then no context is ever needed.
|
||||||
|
fn max_context(&self) -> usize {
|
||||||
|
cmp::max(self.before_context, self.after_context)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a line buffer from this configuration.
|
||||||
|
fn line_buffer(&self) -> LineBuffer {
|
||||||
|
let mut builder = LineBufferBuilder::new();
|
||||||
|
builder
|
||||||
|
.line_terminator(self.line_term.as_byte())
|
||||||
|
.binary_detection(self.binary.0);
|
||||||
|
|
||||||
|
if let Some(limit) = self.heap_limit {
|
||||||
|
let (capacity, additional) =
|
||||||
|
if limit <= DEFAULT_BUFFER_CAPACITY {
|
||||||
|
(limit, 0)
|
||||||
|
} else {
|
||||||
|
(DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
|
||||||
|
};
|
||||||
|
builder
|
||||||
|
.capacity(capacity)
|
||||||
|
.buffer_alloc(BufferAllocation::Error(additional));
|
||||||
|
}
|
||||||
|
builder.build()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An error that can occur when building a searcher.
|
||||||
|
///
|
||||||
|
/// This error occurs when a non-sensical configuration is present when trying
|
||||||
|
/// to construct a `Searcher` from a `SearcherBuilder`.
|
||||||
|
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||||
|
pub enum ConfigError {
|
||||||
|
/// Indicates that the heap limit configuration prevents all possible
|
||||||
|
/// search strategies from being used. For example, if the heap limit is
|
||||||
|
/// set to 0 and memory map searching is disabled or unavailable.
|
||||||
|
SearchUnavailable,
|
||||||
|
/// Occurs when a matcher reports a line terminator that is different than
|
||||||
|
/// the one configured in the searcher.
|
||||||
|
MismatchedLineTerminators {
|
||||||
|
/// The matcher's line terminator.
|
||||||
|
matcher: LineTerminator,
|
||||||
|
/// The searcher's line terminator.
|
||||||
|
searcher: LineTerminator,
|
||||||
|
},
|
||||||
|
/// Occurs when no encoding could be found for a particular label.
|
||||||
|
UnknownEncoding {
|
||||||
|
/// The provided encoding label that could not be found.
|
||||||
|
label: Vec<u8>,
|
||||||
|
},
|
||||||
|
/// Hints that destructuring should not be exhaustive.
|
||||||
|
///
|
||||||
|
/// This enum may grow additional variants, so this makes sure clients
|
||||||
|
/// don't count on exhaustive matching. (Otherwise, adding a new variant
|
||||||
|
/// could break existing code.)
|
||||||
|
#[doc(hidden)]
|
||||||
|
__Nonexhaustive,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ::std::error::Error for ConfigError {
|
||||||
|
fn description(&self) -> &str { "grep-searcher configuration error" }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Display for ConfigError {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
|
match *self {
|
||||||
|
ConfigError::SearchUnavailable => {
|
||||||
|
write!(f, "grep config error: no available searchers")
|
||||||
|
}
|
||||||
|
ConfigError::MismatchedLineTerminators { matcher, searcher } => {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"grep config error: mismatched line terminators, \
|
||||||
|
matcher has {:?} but searcher has {:?}",
|
||||||
|
matcher,
|
||||||
|
searcher
|
||||||
|
)
|
||||||
|
}
|
||||||
|
ConfigError::UnknownEncoding { ref label } => {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"grep config error: unknown encoding: {}",
|
||||||
|
String::from_utf8_lossy(label),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
_ => panic!("BUG: unexpected variant found"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A builder for configuring a searcher.
|
||||||
|
///
|
||||||
|
/// A search builder permits specifying the configuration of a searcher,
|
||||||
|
/// including options like whether to invert the search or to enable multi
|
||||||
|
/// line search.
|
||||||
|
///
|
||||||
|
/// Once a searcher has been built, it is beneficial to reuse that searcher
|
||||||
|
/// for multiple searches, if possible.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct SearcherBuilder {
|
||||||
|
config: Config,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for SearcherBuilder {
|
||||||
|
fn default() -> SearcherBuilder {
|
||||||
|
SearcherBuilder::new()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SearcherBuilder {
|
||||||
|
/// Create a new searcher builder with a default configuration.
|
||||||
|
pub fn new() -> SearcherBuilder {
|
||||||
|
SearcherBuilder {
|
||||||
|
config: Config::default(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build a searcher with the given matcher.
|
||||||
|
pub fn build(&self) -> Searcher {
|
||||||
|
let mut config = self.config.clone();
|
||||||
|
if config.passthru {
|
||||||
|
config.before_context = 0;
|
||||||
|
config.after_context = 0;
|
||||||
|
}
|
||||||
|
let mut decode_builder = DecodeReaderBytesBuilder::new();
|
||||||
|
decode_builder
|
||||||
|
.encoding(self.config.encoding.as_ref().map(|e| e.0))
|
||||||
|
.utf8_passthru(true)
|
||||||
|
.strip_bom(true)
|
||||||
|
.bom_override(true);
|
||||||
|
Searcher {
|
||||||
|
config: config,
|
||||||
|
decode_builder: decode_builder,
|
||||||
|
decode_buffer: RefCell::new(vec![0; 8 * (1<<10)]),
|
||||||
|
line_buffer: RefCell::new(self.config.line_buffer()),
|
||||||
|
multi_line_buffer: RefCell::new(vec![]),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the line terminator that is used by the searcher.
|
||||||
|
///
|
||||||
|
/// When using a searcher, if the matcher provided has a line terminator
|
||||||
|
/// set, then it must be the same as this one. If they aren't, building
|
||||||
|
/// a searcher will return an error.
|
||||||
|
///
|
||||||
|
/// By default, this is set to `b'\n'`.
|
||||||
|
pub fn line_terminator(
|
||||||
|
&mut self,
|
||||||
|
line_term: LineTerminator,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.line_term = line_term;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to invert matching, whereby lines that don't match are reported
|
||||||
|
/// instead of reporting lines that do match.
|
||||||
|
///
|
||||||
|
/// By default, this is disabled.
|
||||||
|
pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||||
|
self.config.invert_match = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to count and include line numbers with matching lines.
|
||||||
|
///
|
||||||
|
/// This is enabled by default. There is a small performance penalty
|
||||||
|
/// associated with computing line numbers, so this can be disabled when
|
||||||
|
/// this isn't desirable.
|
||||||
|
pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||||
|
self.config.line_number = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to enable multi line search or not.
|
||||||
|
///
|
||||||
|
/// When multi line search is enabled, matches *may* match across multiple
|
||||||
|
/// lines. Conversely, when multi line search is disabled, it is impossible
|
||||||
|
/// for any match to span more than one line.
|
||||||
|
///
|
||||||
|
/// **Warning:** multi line search requires having the entire contents to
|
||||||
|
/// search mapped in memory at once. When searching files, memory maps
|
||||||
|
/// will be used if possible and if they are enabled, which avoids using
|
||||||
|
/// your program's heap. However, if memory maps cannot be used (e.g.,
|
||||||
|
/// for searching streams like `stdin` or if transcoding is necessary),
|
||||||
|
/// then the entire contents of the stream are read on to the heap before
|
||||||
|
/// starting the search.
|
||||||
|
///
|
||||||
|
/// This is disabled by default.
|
||||||
|
pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||||
|
self.config.multi_line = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to include a fixed number of lines after every match.
|
||||||
|
///
|
||||||
|
/// When this is set to a non-zero number, then the searcher will report
|
||||||
|
/// `line_count` contextual lines after every match.
|
||||||
|
///
|
||||||
|
/// This is set to `0` by default.
|
||||||
|
pub fn after_context(
|
||||||
|
&mut self,
|
||||||
|
line_count: usize,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.after_context = line_count;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to include a fixed number of lines before every match.
|
||||||
|
///
|
||||||
|
/// When this is set to a non-zero number, then the searcher will report
|
||||||
|
/// `line_count` contextual lines before every match.
|
||||||
|
///
|
||||||
|
/// This is set to `0` by default.
|
||||||
|
pub fn before_context(
|
||||||
|
&mut self,
|
||||||
|
line_count: usize,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.before_context = line_count;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to enable the "passthru" feature or not.
|
||||||
|
///
|
||||||
|
/// When passthru is enabled, it effectively treats all non-matching lines
|
||||||
|
/// as contextual lines. In other words, enabling this is akin to
|
||||||
|
/// requesting an unbounded number of before and after contextual lines.
|
||||||
|
///
|
||||||
|
/// When passthru mode is enabled, any `before_context` or `after_context`
|
||||||
|
/// settings are ignored by setting them to `0`.
|
||||||
|
///
|
||||||
|
/// This is disabled by default.
|
||||||
|
pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
|
||||||
|
self.config.passthru = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set an approximate limit on the amount of heap space used by a
|
||||||
|
/// searcher.
|
||||||
|
///
|
||||||
|
/// The heap limit is enforced in two scenarios:
|
||||||
|
///
|
||||||
|
/// * When searching using a fixed size buffer, the heap limit controls
|
||||||
|
/// how big this buffer is allowed to be. Assuming contexts are disabled,
|
||||||
|
/// the minimum size of this buffer is the length (in bytes) of the
|
||||||
|
/// largest single line in the contents being searched. If any line
|
||||||
|
/// exceeds the heap limit, then an error will be returned.
|
||||||
|
/// * When performing a multi line search, a fixed size buffer cannot be
|
||||||
|
/// used. Thus, the only choices are to read the entire contents on to
|
||||||
|
/// the heap, or use memory maps. In the former case, the heap limit set
|
||||||
|
/// here is enforced.
|
||||||
|
///
|
||||||
|
/// If a heap limit is set to `0`, then no heap space is used. If there are
|
||||||
|
/// no alternative strategies available for searching without heap space
|
||||||
|
/// (e.g., memory maps are disabled), then the searcher wil return an error
|
||||||
|
/// immediately.
|
||||||
|
///
|
||||||
|
/// By default, no limit is set.
|
||||||
|
pub fn heap_limit(
|
||||||
|
&mut self,
|
||||||
|
bytes: Option<usize>,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.heap_limit = bytes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the strategy to employ use of memory maps.
|
||||||
|
///
|
||||||
|
/// Currently, there are only two strategies that can be employed:
|
||||||
|
///
|
||||||
|
/// * **Automatic** - A searcher will use heuristics, including but not
|
||||||
|
/// limited to file size and platform, to determine whether to use memory
|
||||||
|
/// maps or not.
|
||||||
|
/// * **Never** - Memory maps will never be used. If multi line search is
|
||||||
|
/// enabled, then the entire contents will be read on to the heap before
|
||||||
|
/// searching begins.
|
||||||
|
///
|
||||||
|
/// The default behavior is **never**. Generally speaking, and perhaps
|
||||||
|
/// against conventional wisdom, memory maps don't necessarily enable
|
||||||
|
/// faster searching. For example, depending on the platform, using memory
|
||||||
|
/// maps while searching a large directory can actually be quite a bit
|
||||||
|
/// slower than using normal read calls because of the overhead of managing
|
||||||
|
/// the memory maps.
|
||||||
|
///
|
||||||
|
/// Memory maps can be faster in some cases however. On some platforms,
|
||||||
|
/// when searching a very large file that *is already in memory*, it can
|
||||||
|
/// be slightly faster to search it as a memory map instead of using
|
||||||
|
/// normal read calls.
|
||||||
|
///
|
||||||
|
/// Finally, memory maps have a somewhat complicated safety story in Rust.
|
||||||
|
/// If you aren't sure whether enabling memory maps is worth it, then just
|
||||||
|
/// don't bother with it.
|
||||||
|
///
|
||||||
|
/// **WARNING**: If your process is searching a file backed memory map
|
||||||
|
/// at the same time that file is truncated, then it's possible for the
|
||||||
|
/// process to terminate with a bus error.
|
||||||
|
pub fn memory_map(
|
||||||
|
&mut self,
|
||||||
|
strategy: MmapChoice,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.mmap = strategy;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the binary detection strategy.
|
||||||
|
///
|
||||||
|
/// The binary detection strategy determines not only how the searcher
|
||||||
|
/// detects binary data, but how it responds to the presence of binary
|
||||||
|
/// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type
|
||||||
|
/// for more information.
|
||||||
|
///
|
||||||
|
/// By default, binary detection is disabled.
|
||||||
|
pub fn binary_detection(
|
||||||
|
&mut self,
|
||||||
|
detection: BinaryDetection,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.binary = detection;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the encoding used to read the source data before searching.
|
||||||
|
///
|
||||||
|
/// When an encoding is provided, then the source data is _unconditionally_
|
||||||
|
/// transcoded using the encoding, unless a BOM is present. If a BOM is
|
||||||
|
/// present, then the encoding indicated by the BOM is used instead. If the
|
||||||
|
/// transcoding process encounters an error, then bytes are replaced with
|
||||||
|
/// the Unicode replacement codepoint.
|
||||||
|
///
|
||||||
|
/// When no encoding is specified (the default), then BOM sniffing is used
|
||||||
|
/// to determine whether the source data is UTF-8 or UTF-16, and
|
||||||
|
/// transcoding will be performed automatically. If no BOM could be found,
|
||||||
|
/// then the source data is searched _as if_ it were UTF-8. However, so
|
||||||
|
/// long as the source data is at least ASCII compatible, then it is
|
||||||
|
/// possible for a search to produce useful results.
|
||||||
|
pub fn encoding(
|
||||||
|
&mut self,
|
||||||
|
encoding: Option<Encoding>,
|
||||||
|
) -> &mut SearcherBuilder {
|
||||||
|
self.config.encoding = encoding;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A searcher executes searches over a haystack and writes results to a caller
/// provided sink.
///
/// Matches are detected via implementations of the `Matcher` trait, which must
/// be provided by the caller when executing a search.
///
/// When possible, a searcher should be reused.
#[derive(Clone, Debug)]
pub struct Searcher {
    /// The configuration for this searcher.
    ///
    /// We make most of these settings available to users of `Searcher` via
    /// public API methods, which can be queried in implementations of `Sink`
    /// if necessary.
    config: Config,
    /// A builder for constructing a streaming reader that transcodes source
    /// data according to either an explicitly specified encoding or via an
    /// automatically detected encoding via BOM sniffing.
    ///
    /// When no transcoding is needed, then the transcoder built will pass
    /// through the underlying bytes with no additional overhead.
    decode_builder: DecodeReaderBytesBuilder,
    /// A buffer that is used for transcoding scratch space.
    ///
    /// Wrapped in a RefCell (like the other buffers below) so that search
    /// methods can reuse it while `Searcher` is borrowed immutably by sinks.
    decode_buffer: RefCell<Vec<u8>>,
    /// A line buffer for use in line oriented searching.
    ///
    /// We wrap it in a RefCell to permit lending out borrows of `Searcher`
    /// to sinks. We still require a mutable borrow to execute a search, so
    /// we statically prevent callers from causing RefCell to panic at runtime
    /// due to a borrowing violation.
    line_buffer: RefCell<LineBuffer>,
    /// A buffer in which to store the contents of a reader when performing a
    /// multi line search. In particular, multi line searches cannot be
    /// performed incrementally, and need the entire haystack in memory at
    /// once.
    multi_line_buffer: RefCell<Vec<u8>>,
}
|
||||||
|
|
||||||
|
impl Searcher {
|
||||||
|
/// Create a new searcher with a default configuration.
///
/// To configure the searcher (e.g., invert matching, enable memory maps,
/// enable contexts, etc.), use the
/// [`SearcherBuilder`](struct.SearcherBuilder.html).
pub fn new() -> Searcher {
    // Delegate to the builder so defaults are defined in one place.
    SearcherBuilder::new().build()
}
|
||||||
|
|
||||||
|
/// Execute a search over the file with the given path and write the
|
||||||
|
/// results to the given sink.
|
||||||
|
///
|
||||||
|
/// If memory maps are enabled and the searcher heuristically believes
|
||||||
|
/// memory maps will help the search run faster, then this will use
|
||||||
|
/// memory maps. For this reason, callers should prefer using this method
|
||||||
|
/// or `search_file` over the more generic `search_reader` when possible.
|
||||||
|
pub fn search_path<P, M, S>(
|
||||||
|
&mut self,
|
||||||
|
matcher: M,
|
||||||
|
path: P,
|
||||||
|
write_to: S,
|
||||||
|
) -> Result<(), S::Error>
|
||||||
|
where P: AsRef<Path>,
|
||||||
|
M: Matcher,
|
||||||
|
S: Sink,
|
||||||
|
{
|
||||||
|
let path = path.as_ref();
|
||||||
|
let file = File::open(path).map_err(S::Error::error_io)?;
|
||||||
|
self.search_file_maybe_path(matcher, Some(path), &file, write_to)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a search over a file and write the results to the given sink.
|
||||||
|
///
|
||||||
|
/// If memory maps are enabled and the searcher heuristically believes
|
||||||
|
/// memory maps will help the search run faster, then this will use
|
||||||
|
/// memory maps. For this reason, callers should prefer using this method
|
||||||
|
/// or `search_path` over the more generic `search_reader` when possible.
|
||||||
|
pub fn search_file<M, S>(
|
||||||
|
&mut self,
|
||||||
|
matcher: M,
|
||||||
|
file: &File,
|
||||||
|
write_to: S,
|
||||||
|
) -> Result<(), S::Error>
|
||||||
|
where M: Matcher,
|
||||||
|
S: Sink,
|
||||||
|
{
|
||||||
|
self.search_file_maybe_path(matcher, None, file, write_to)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn search_file_maybe_path<M, S>(
|
||||||
|
&mut self,
|
||||||
|
matcher: M,
|
||||||
|
path: Option<&Path>,
|
||||||
|
file: &File,
|
||||||
|
write_to: S,
|
||||||
|
) -> Result<(), S::Error>
|
||||||
|
where M: Matcher,
|
||||||
|
S: Sink,
|
||||||
|
{
|
||||||
|
if let Some(mmap) = self.config.mmap.open(file, path) {
|
||||||
|
trace!("{:?}: searching via memory map", path);
|
||||||
|
return self.search_slice(matcher, &mmap, write_to);
|
||||||
|
}
|
||||||
|
// Fast path for multi-line searches of files when memory maps are
|
||||||
|
// not enabled. This pre-allocates a buffer roughly the size of the
|
||||||
|
// file, which isn't possible when searching an arbitrary io::Read.
|
||||||
|
if self.multi_line_with_matcher(&matcher) {
|
||||||
|
trace!("{:?}: reading entire file on to heap for mulitline", path);
|
||||||
|
self.fill_multi_line_buffer_from_file::<S>(file)?;
|
||||||
|
trace!("{:?}: searching via multiline strategy", path);
|
||||||
|
MultiLine::new(
|
||||||
|
self,
|
||||||
|
matcher,
|
||||||
|
&*self.multi_line_buffer.borrow(),
|
||||||
|
write_to,
|
||||||
|
).run()
|
||||||
|
} else {
|
||||||
|
trace!("{:?}: searching using generic reader", path);
|
||||||
|
self.search_reader(matcher, file, write_to)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a search over any implementation of `io::Read` and write the
/// results to the given sink.
///
/// When possible, this implementation will search the reader incrementally
/// without reading it into memory. In some cases---for example, if multi
/// line search is enabled---an incremental search isn't possible and the
/// given reader is consumed completely and placed on the heap before
/// searching begins. For this reason, when multi line search is enabled,
/// one should try to use higher level APIs (e.g., searching by file or
/// file path) so that memory maps can be used if they are available and
/// enabled.
pub fn search_reader<M, R, S>(
    &mut self,
    matcher: M,
    read_from: R,
    write_to: S,
) -> Result<(), S::Error>
where M: Matcher,
      R: io::Read,
      S: Sink,
{
    // Reject configurations that cannot possibly execute (e.g., a zero
    // heap limit with memory maps disabled, or mismatched line
    // terminators between matcher and searcher).
    self.check_config(&matcher).map_err(S::Error::error_config)?;

    // Wrap the caller's reader so that bytes are transcoded according to
    // this searcher's encoding configuration (or passed through when no
    // transcoding is needed). The scratch buffer lives in a RefCell so
    // it can be reused across searches.
    let mut decode_buffer = self.decode_buffer.borrow_mut();
    let read_from = self.decode_builder
        .build_with_buffer(read_from, &mut *decode_buffer)
        .map_err(S::Error::error_io)?;

    if self.multi_line_with_matcher(&matcher) {
        // Multi line searches need the entire haystack in memory.
        trace!("generic reader: reading everything to heap for multiline");
        self.fill_multi_line_buffer_from_reader::<_, S>(read_from)?;
        trace!("generic reader: searching via multiline strategy");
        MultiLine::new(
            self,
            matcher,
            &*self.multi_line_buffer.borrow(),
            write_to,
        ).run()
    } else {
        // Otherwise, search incrementally, line by line, through a
        // rolling buffer.
        let mut line_buffer = self.line_buffer.borrow_mut();
        let rdr = LineBufferReader::new(read_from, &mut *line_buffer);
        trace!("generic reader: searching via roll buffer strategy");
        ReadByLine::new(self, matcher, rdr, write_to).run()
    }
}
|
||||||
|
|
||||||
|
/// Execute a search over the given slice and write the results to the
|
||||||
|
/// given sink.
|
||||||
|
pub fn search_slice<M, S>(
|
||||||
|
&mut self,
|
||||||
|
matcher: M,
|
||||||
|
slice: &[u8],
|
||||||
|
write_to: S,
|
||||||
|
) -> Result<(), S::Error>
|
||||||
|
where M: Matcher,
|
||||||
|
S: Sink,
|
||||||
|
{
|
||||||
|
self.check_config(&matcher).map_err(S::Error::error_config)?;
|
||||||
|
|
||||||
|
// We can search the slice directly, unless we need to do transcoding.
|
||||||
|
if self.slice_needs_transcoding(slice) {
|
||||||
|
trace!("slice reader: needs transcoding, using generic reader");
|
||||||
|
return self.search_reader(matcher, slice, write_to);
|
||||||
|
}
|
||||||
|
if self.multi_line_with_matcher(&matcher) {
|
||||||
|
trace!("slice reader: searching via multiline strategy");
|
||||||
|
MultiLine::new(self, matcher, slice, write_to).run()
|
||||||
|
} else {
|
||||||
|
trace!("slice reader: searching via slice-by-line strategy");
|
||||||
|
SliceByLine::new(self, matcher, slice, write_to).run()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check that the searcher's configuration and the matcher are consistent
|
||||||
|
/// with each other.
|
||||||
|
fn check_config<M: Matcher>(&self, matcher: M) -> Result<(), ConfigError> {
|
||||||
|
if self.config.heap_limit == Some(0)
|
||||||
|
&& !self.config.mmap.is_enabled()
|
||||||
|
{
|
||||||
|
return Err(ConfigError::SearchUnavailable);
|
||||||
|
}
|
||||||
|
let matcher_line_term = match matcher.line_terminator() {
|
||||||
|
None => return Ok(()),
|
||||||
|
Some(line_term) => line_term,
|
||||||
|
};
|
||||||
|
if matcher_line_term != self.config.line_term {
|
||||||
|
return Err(ConfigError::MismatchedLineTerminators {
|
||||||
|
matcher: matcher_line_term,
|
||||||
|
searcher: self.config.line_term,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the given slice needs to be transcoded.
|
||||||
|
fn slice_needs_transcoding(&self, slice: &[u8]) -> bool {
|
||||||
|
self.config.encoding.is_some() || slice_has_utf16_bom(slice)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The following methods permit querying the configuration of a searcher.
|
||||||
|
/// These can be useful in generic implementations of
|
||||||
|
/// [`Sink`](trait.Sink.html),
|
||||||
|
/// where the output may be tailored based on how the searcher is configured.
|
||||||
|
impl Searcher {
|
||||||
|
/// Returns the line terminator used by this searcher.
#[inline]
pub fn line_terminator(&self) -> LineTerminator {
    // Hands back a copy of the configured terminator.
    self.config.line_term
}
|
||||||
|
|
||||||
|
/// Returns true if and only if this searcher is configured to invert its
/// search results. That is, matching lines are lines that do **not** match
/// the searcher's matcher.
#[inline]
pub fn invert_match(&self) -> bool {
    self.config.invert_match
}
|
||||||
|
|
||||||
|
/// Returns true if and only if this searcher is configured to count line
/// numbers.
#[inline]
pub fn line_number(&self) -> bool {
    self.config.line_number
}
|
||||||
|
|
||||||
|
/// Returns true if and only if this searcher is configured to perform
/// multi line search.
#[inline]
pub fn multi_line(&self) -> bool {
    self.config.multi_line
}
|
||||||
|
|
||||||
|
/// Returns true if and only if this searcher will choose a multi-line
|
||||||
|
/// strategy given the provided matcher.
|
||||||
|
///
|
||||||
|
/// This may diverge from the result of `multi_line` in cases where the
|
||||||
|
/// searcher has been configured to execute a search that can report
|
||||||
|
/// matches over multiple lines, but where the matcher guarantees that it
|
||||||
|
/// will never produce a match over multiple lines.
|
||||||
|
pub fn multi_line_with_matcher<M: Matcher>(&self, matcher: M) -> bool {
|
||||||
|
if !self.multi_line() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if let Some(line_term) = matcher.line_terminator() {
|
||||||
|
if line_term == self.line_terminator() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Some(non_matching) = matcher.non_matching_bytes() {
|
||||||
|
// If the line terminator is CRLF, we don't actually need to care
|
||||||
|
// whether the regex can match `\r` or not. Namely, a `\r` is
|
||||||
|
// neither necessary nor sufficient to terminate a line. A `\n` is
|
||||||
|
// always required.
|
||||||
|
if non_matching.contains(self.line_terminator().as_byte()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the number of "after" context lines to report. When context
/// reporting is not enabled, this returns `0`.
#[inline]
pub fn after_context(&self) -> usize {
    self.config.after_context
}
|
||||||
|
|
||||||
|
/// Returns the number of "before" context lines to report. When context
/// reporting is not enabled, this returns `0`.
#[inline]
pub fn before_context(&self) -> usize {
    self.config.before_context
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the searcher has "passthru" mode enabled.
#[inline]
pub fn passthru(&self) -> bool {
    self.config.passthru
}
|
||||||
|
|
||||||
|
/// Fill the buffer for use with multi-line searching from the given file.
/// This reads from the file until EOF or until an error occurs. If the
/// contents exceed the configured heap limit, then an error is returned.
fn fill_multi_line_buffer_from_file<S: Sink>(
    &self,
    file: &File,
) -> Result<(), S::Error> {
    // Only the multi line strategy uses this buffer; calling it
    // otherwise is a bug in the searcher.
    assert!(self.config.multi_line);

    let mut decode_buffer = self.decode_buffer.borrow_mut();
    let mut read_from = self.decode_builder
        .build_with_buffer(file, &mut *decode_buffer)
        .map_err(S::Error::error_io)?;

    // If we don't have a heap limit, then we can defer to std's
    // read_to_end implementation. fill_multi_line_buffer_from_reader will
    // do this too, but since we have a File, we can be a bit smarter about
    // pre-allocating here.
    //
    // If we're transcoding, then our pre-allocation might not be exact,
    // but is probably still better than nothing.
    if self.config.heap_limit.is_none() {
        let mut buf = self.multi_line_buffer.borrow_mut();
        buf.clear();
        // NOTE(review): the `+ 1` presumably lets the final read observe
        // EOF without another reallocation — confirm. A metadata failure
        // simply falls back to no pre-allocation.
        let cap = file
            .metadata()
            .map(|m| m.len() as usize + 1)
            .unwrap_or(0);
        buf.reserve(cap);
        read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
        return Ok(());
    }
    self.fill_multi_line_buffer_from_reader::<_, S>(read_from)
}
|
||||||
|
|
||||||
|
/// Fill the buffer for use with multi-line searching from the given
/// reader. This reads from the reader until EOF or until an error occurs.
/// If the contents exceed the configured heap limit, then an error is
/// returned.
fn fill_multi_line_buffer_from_reader<R: io::Read, S: Sink>(
    &self,
    mut read_from: R,
) -> Result<(), S::Error> {
    // Only the multi line strategy uses this buffer.
    assert!(self.config.multi_line);

    let mut buf = self.multi_line_buffer.borrow_mut();
    buf.clear();

    // If we don't have a heap limit, then we can defer to std's
    // read_to_end implementation...
    let heap_limit = match self.config.heap_limit {
        Some(heap_limit) => heap_limit,
        None => {
            read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?;
            return Ok(());
        }
    };
    // A zero heap limit means no buffering is possible at all.
    if heap_limit == 0 {
        return Err(S::Error::error_io(alloc_error(heap_limit)));
    }

    // ... otherwise we need to roll our own. This is likely quite a bit
    // slower than what is optimal, but we avoid worry about memory safety
    // until there's a compelling reason to speed this up.
    buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
    let mut pos = 0;
    loop {
        let nread = match read_from.read(&mut buf[pos..]) {
            Ok(nread) => nread,
            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
                // Interrupted reads are retried, per io::Read convention.
                continue;
            }
            Err(err) => return Err(S::Error::error_io(err)),
        };
        if nread == 0 {
            // EOF: shrink the buffer to exactly the bytes read.
            buf.resize(pos, 0);
            return Ok(());
        }

        pos += nread;
        if buf[pos..].is_empty() {
            // The buffer is full: grow it, doubling where possible but
            // never past the heap limit. If we're already at the limit,
            // report an allocation error.
            let additional = heap_limit - buf.len();
            if additional == 0 {
                return Err(S::Error::error_io(alloc_error(heap_limit)));
            }
            let limit = buf.len() + additional;
            let doubled = 2 * buf.len();
            buf.resize(cmp::min(doubled, limit), 0);
        }
    }
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the given slice begins with a UTF-16 BOM.
|
||||||
|
///
|
||||||
|
/// This is used by the searcher to determine if a transcoder is necessary.
|
||||||
|
/// Otherwise, it is advantageous to search the slice directly.
|
||||||
|
fn slice_has_utf16_bom(slice: &[u8]) -> bool {
|
||||||
|
let enc = match encoding_rs::Encoding::for_bom(slice) {
|
||||||
|
None => return false,
|
||||||
|
Some((enc, _)) => enc,
|
||||||
|
};
|
||||||
|
[encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use testutil::{KitchenSink, RegexMatcher};
    use super::*;

    // A heap limit of 0 combined with memory maps disabled (the default)
    // leaves no viable search strategy, so the search must error out.
    #[test]
    fn config_error_heap_limit() {
        let matcher = RegexMatcher::new("");
        let sink = KitchenSink::new();
        let mut searcher = SearcherBuilder::new()
            .heap_limit(Some(0))
            .build();
        let res = searcher.search_slice(matcher, &[], sink);
        assert!(res.is_err());
    }

    // A matcher whose line terminator (`z`) disagrees with the searcher's
    // configured terminator must be rejected as a config error.
    #[test]
    fn config_error_line_terminator() {
        let mut matcher = RegexMatcher::new("");
        matcher.set_line_term(Some(LineTerminator::byte(b'z')));

        let sink = KitchenSink::new();
        let mut searcher = Searcher::new();
        let res = searcher.search_slice(matcher, &[], sink);
        assert!(res.is_err());
    }
}
|
||||||
606
grep-searcher/src/sink.rs
Normal file
606
grep-searcher/src/sink.rs
Normal file
@@ -0,0 +1,606 @@
|
|||||||
|
use std::fmt;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
|
use grep_matcher::LineTerminator;
|
||||||
|
|
||||||
|
use lines::LineIter;
|
||||||
|
use searcher::{ConfigError, Searcher};
|
||||||
|
|
||||||
|
/// A trait that describes errors that can be reported by searchers and
/// implementations of `Sink`.
///
/// Unless you have a specialized use case, you probably don't need to
/// implement this trait explicitly. It's likely that using `io::Error` (which
/// implements this trait) for your error type is good enough, largely because
/// most errors that occur during search will likely be an `io::Error`.
pub trait SinkError: Sized {
    /// A constructor for converting any value that satisfies the
    /// `fmt::Display` trait into an error.
    fn error_message<T: fmt::Display>(message: T) -> Self;

    /// A constructor for converting I/O errors that occur while searching into
    /// an error of this type.
    ///
    /// By default, this is implemented via the `error_message` constructor.
    fn error_io(err: io::Error) -> Self {
        Self::error_message(err)
    }

    /// A constructor for converting configuration errors that occur while
    /// building a searcher into an error of this type.
    ///
    /// By default, this is implemented via the `error_message` constructor.
    fn error_config(err: ConfigError) -> Self {
        Self::error_message(err)
    }
}
|
||||||
|
|
||||||
|
/// An `io::Error` can be used as an error for `Sink` implementations out of
/// the box.
impl SinkError for io::Error {
    fn error_message<T: fmt::Display>(message: T) -> io::Error {
        io::Error::new(io::ErrorKind::Other, message.to_string())
    }

    // Override the default so an I/O error is passed through unchanged
    // rather than being flattened into a message string.
    fn error_io(err: io::Error) -> io::Error {
        err
    }
}
|
||||||
|
|
||||||
|
/// A `Box<std::error::Error>` can be used as an error for `Sink`
/// implementations out of the box.
impl SinkError for Box<::std::error::Error> {
    fn error_message<T: fmt::Display>(message: T) -> Box<::std::error::Error> {
        // Boxes the display string via the standard From<String> impl.
        Box::<::std::error::Error>::from(message.to_string())
    }
}
|
||||||
|
|
||||||
|
/// A trait that defines how results from searchers are handled.
|
||||||
|
///
|
||||||
|
/// In this crate, a searcher follows the "push" model. What that means is that
|
||||||
|
/// the searcher drives execution, and pushes results back to the caller. This
|
||||||
|
/// is in contrast to a "pull" model where the caller drives execution and
|
||||||
|
/// takes results as they need them. These are also known as "internal" and
|
||||||
|
/// "external" iteration strategies, respectively.
|
||||||
|
///
|
||||||
|
/// For a variety of reasons, including the complexity of the searcher
|
||||||
|
/// implementation, this crate chooses the "push" or "internal" model of
|
||||||
|
/// execution. Thus, in order to act on search results, callers must provide
|
||||||
|
/// an implementation of this trait to a searcher, and the searcher is then
|
||||||
|
/// responsible for calling the methods on this trait.
|
||||||
|
///
|
||||||
|
/// This trait defines several behaviors:
|
||||||
|
///
|
||||||
|
/// * What to do when a match is found. Callers must provide this.
|
||||||
|
/// * What to do when an error occurs. Callers must provide this via the
|
||||||
|
/// [`SinkError`](trait.SinkError.html) trait. Generally, callers can just
|
||||||
|
/// use `io::Error` for this, which already implements `SinkError`.
|
||||||
|
/// * What to do when a contextual line is found. By default, these are
|
||||||
|
/// ignored.
|
||||||
|
/// * What to do when a gap between contextual lines has been found. By
|
||||||
|
/// default, this is ignored.
|
||||||
|
/// * What to do when a search has started. By default, this does nothing.
|
||||||
|
/// * What to do when a search has finished successfully. By default, this does
|
||||||
|
/// nothing.
|
||||||
|
///
|
||||||
|
/// Callers must, at minimum, specify the behavior when an error occurs and
|
||||||
|
/// the behavior when a match occurs. The rest is optional. For each behavior,
|
||||||
|
/// callers may report an error (say, if writing the result to another
|
||||||
|
/// location failed) or simply return `false` if they want the search to stop
|
||||||
|
/// (e.g., when implementing a cap on the number of search results to show).
|
||||||
|
///
|
||||||
|
/// When errors are reported (whether in the searcher or in the implementation
|
||||||
|
/// of `Sink`), then searchers quit immediately without calling `finish`.
|
||||||
|
///
|
||||||
|
/// For simpler uses of `Sink`, callers may elect to use one of
|
||||||
|
/// the more convenient but less flexible implementations in the
|
||||||
|
/// [`sinks`](sinks/index.html) module.
|
||||||
|
pub trait Sink {
|
||||||
|
/// The type of an error that should be reported by a searcher.
|
||||||
|
///
|
||||||
|
/// Errors of this type are not only returned by the methods on this
|
||||||
|
/// trait, but the constructors defined in `SinkError` are also used in
|
||||||
|
/// the searcher implementation itself. e.g., When a I/O error occurs when
|
||||||
|
/// reading data from a file.
|
||||||
|
type Error: SinkError;
|
||||||
|
|
||||||
|
/// This method is called whenever a match is found.
|
||||||
|
///
|
||||||
|
/// If multi line is enabled on the searcher, then the match reported here
|
||||||
|
/// may span multiple lines and it may include multiple matches. When multi
|
||||||
|
/// line is disabled, then the match is guaranteed to span exactly one
|
||||||
|
/// non-empty line (where a single line is, at minimum, a line terminator).
|
||||||
|
///
|
||||||
|
/// If this returns `true`, then searching continues. If this returns
|
||||||
|
/// `false`, then searching is stopped immediately and `finish` is called.
|
||||||
|
///
|
||||||
|
/// If this returns an error, then searching is stopped immediately,
|
||||||
|
/// `finish` is not called and the error is bubbled back up to the caller
|
||||||
|
/// of the searcher.
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
_mat: &SinkMatch,
|
||||||
|
) -> Result<bool, Self::Error>;
|
||||||
|
|
||||||
|
/// This method is called whenever a context line is found, and is optional
|
||||||
|
/// to implement. By default, it does nothing and returns `true`.
|
||||||
|
///
|
||||||
|
/// In all cases, the context given is guaranteed to span exactly one
|
||||||
|
/// non-empty line (where a single line is, at minimum, a line terminator).
|
||||||
|
///
|
||||||
|
/// If this returns `true`, then searching continues. If this returns
|
||||||
|
/// `false`, then searching is stopped immediately and `finish` is called.
|
||||||
|
///
|
||||||
|
/// If this returns an error, then searching is stopped immediately,
|
||||||
|
/// `finish` is not called and the error is bubbled back up to the caller
|
||||||
|
/// of the searcher.
|
||||||
|
#[inline]
|
||||||
|
fn context(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
_context: &SinkContext,
|
||||||
|
) -> Result<bool, Self::Error> {
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This method is called whenever a break in contextual lines is found,
|
||||||
|
/// and is optional to implement. By default, it does nothing and returns
|
||||||
|
/// `true`.
|
||||||
|
///
|
||||||
|
/// A break can only occur when context reporting is enabled (that is,
|
||||||
|
/// either or both of `before_context` or `after_context` are greater than
|
||||||
|
/// `0`). More precisely, a break occurs between non-contiguous groups of
|
||||||
|
/// lines.
|
||||||
|
///
|
||||||
|
/// If this returns `true`, then searching continues. If this returns
|
||||||
|
/// `false`, then searching is stopped immediately and `finish` is called.
|
||||||
|
///
|
||||||
|
/// If this returns an error, then searching is stopped immediately,
|
||||||
|
/// `finish` is not called and the error is bubbled back up to the caller
|
||||||
|
/// of the searcher.
|
||||||
|
#[inline]
|
||||||
|
fn context_break(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
) -> Result<bool, Self::Error> {
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This method is called when a search has begun, before any search is
|
||||||
|
/// executed. By default, this does nothing.
|
||||||
|
///
|
||||||
|
/// If this returns `true`, then searching continues. If this returns
|
||||||
|
/// `false`, then searching is stopped immediately and `finish` is called.
|
||||||
|
///
|
||||||
|
/// If this returns an error, then searching is stopped immediately,
|
||||||
|
/// `finish` is not called and the error is bubbled back up to the caller
|
||||||
|
/// of the searcher.
|
||||||
|
#[inline]
|
||||||
|
fn begin(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
) -> Result<bool, Self::Error> {
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This method is called when a search has completed. By default, this
/// does nothing.
///
/// If this returns an error, the error is bubbled back up to the caller of
/// the searcher.
#[inline]
fn finish(
    &mut self,
    _searcher: &Searcher,
    // Summary data for the completed search; unused by the default impl.
    _: &SinkFinish,
) -> Result<(), Self::Error> {
    Ok(())
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, S: Sink> Sink for &'a mut S {
|
||||||
|
type Error = S::Error;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).matched(searcher, mat)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn context(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
context: &SinkContext,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).context(searcher, context)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn context_break(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).context_break(searcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn begin(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).begin(searcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn finish(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
sink_finish: &SinkFinish,
|
||||||
|
) -> Result<(), S::Error> {
|
||||||
|
(**self).finish(searcher, sink_finish)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<S: Sink + ?Sized> Sink for Box<S> {
|
||||||
|
type Error = S::Error;
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).matched(searcher, mat)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn context(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
context: &SinkContext,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).context(searcher, context)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn context_break(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).context_break(searcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn begin(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
) -> Result<bool, S::Error> {
|
||||||
|
(**self).begin(searcher)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn finish(
|
||||||
|
&mut self,
|
||||||
|
searcher: &Searcher,
|
||||||
|
sink_finish: &SinkFinish,
|
||||||
|
) -> Result<(), S::Error> {
|
||||||
|
(**self).finish(searcher, sink_finish)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Summary data reported at the end of a search.
///
/// This reports data such as the total number of bytes searched and the
/// absolute offset of the first occurrence of binary data, if any were found.
///
/// A searcher that stops early because of an error does not call `finish`.
/// A searcher that stops early because the `Sink` implementor instructed it
/// to will still call `finish`.
#[derive(Clone, Debug)]
pub struct SinkFinish {
    // Total number of bytes searched. Exposed via `byte_count`.
    pub(crate) byte_count: u64,
    // Absolute offset of the first detected byte of binary data, if binary
    // detection is enabled and any was found. Exposed via
    // `binary_byte_offset`.
    pub(crate) binary_byte_offset: Option<u64>,
}
|
||||||
|
|
||||||
|
impl SinkFinish {
    /// Return the total number of bytes searched.
    ///
    /// Note that this is an absolute count and is therefore a `u64` rather
    /// than a `usize`.
    #[inline]
    pub fn byte_count(&self) -> u64 {
        self.byte_count
    }

    /// If binary detection is enabled and if binary data was found, then this
    /// returns the absolute byte offset of the first detected byte of binary
    /// data.
    ///
    /// Note that since this is an absolute byte offset, it cannot be relied
    /// upon to index into any addressable memory.
    #[inline]
    pub fn binary_byte_offset(&self) -> Option<u64> {
        self.binary_byte_offset
    }
}
|
||||||
|
|
||||||
|
/// A type that describes a match reported by a searcher.
#[derive(Clone, Debug)]
pub struct SinkMatch<'b> {
    // Line terminator used to split `bytes` into lines.
    pub(crate) line_term: LineTerminator,
    // The matched line(s), borrowed from the searcher's buffer, including
    // line terminators where present.
    pub(crate) bytes: &'b [u8],
    // Offset of the start of this match relative to the very beginning of
    // the input.
    pub(crate) absolute_byte_offset: u64,
    // Line number of the first matching line, if line counting is enabled.
    pub(crate) line_number: Option<u64>,
}
|
||||||
|
|
||||||
|
impl<'b> SinkMatch<'b> {
    /// Returns the bytes for all matching lines, including the line
    /// terminators, if they exist.
    #[inline]
    pub fn bytes(&self) -> &'b [u8] {
        self.bytes
    }

    /// Return an iterator over the lines in this match, split on this
    /// match's line terminator.
    ///
    /// If multi line search is enabled, then this may yield more than one
    /// line (but always at least one line). If multi line search is disabled,
    /// then this always reports exactly one line (but may consist of just
    /// the line terminator).
    ///
    /// Lines yielded by this iterator include their terminators.
    #[inline]
    pub fn lines(&self) -> LineIter<'b> {
        LineIter::new(self.line_term.as_byte(), self.bytes)
    }

    /// Returns the absolute byte offset of the start of this match. This
    /// offset is absolute in that it is relative to the very beginning of the
    /// input in a search, and can never be relied upon to be a valid index
    /// into an in-memory slice.
    #[inline]
    pub fn absolute_byte_offset(&self) -> u64 {
        self.absolute_byte_offset
    }

    /// Returns the line number of the first line in this match, if available.
    ///
    /// Line numbers are only available when the search builder is instructed
    /// to compute them.
    #[inline]
    pub fn line_number(&self) -> Option<u64> {
        self.line_number
    }
}
|
||||||
|
|
||||||
|
/// The type of context reported by a searcher.
///
/// This is reported to `Sink` implementations via `SinkContext::kind`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum SinkContextKind {
    /// The line reported occurred before a match.
    Before,
    /// The line reported occurred after a match.
    After,
    /// Any other type of context reported, e.g., as a result of a searcher's
    /// "passthru" mode.
    Other,
}
|
||||||
|
|
||||||
|
/// A type that describes a contextual line reported by a searcher.
#[derive(Clone, Debug)]
pub struct SinkContext<'b> {
    // Line terminator used to split `bytes` into lines.
    pub(crate) line_term: LineTerminator,
    // The contextual line, borrowed from the searcher's buffer, including
    // its line terminator where present.
    pub(crate) bytes: &'b [u8],
    // Whether this line occurred before a match, after a match, or was
    // reported for some other reason (e.g., passthru mode).
    pub(crate) kind: SinkContextKind,
    // Offset of the start of this context relative to the very beginning of
    // the input.
    pub(crate) absolute_byte_offset: u64,
    // Line number of this contextual line, if line counting is enabled.
    pub(crate) line_number: Option<u64>,
}
|
||||||
|
|
||||||
|
impl<'b> SinkContext<'b> {
    /// Returns the context bytes, including line terminators.
    #[inline]
    pub fn bytes(&self) -> &'b [u8] {
        self.bytes
    }

    /// Returns the type of context (before a match, after a match or other).
    #[inline]
    pub fn kind(&self) -> &SinkContextKind {
        &self.kind
    }

    /// Return an iterator over the lines in this match.
    ///
    /// This always yields exactly one line (and that one line may contain just
    /// the line terminator).
    ///
    /// Lines yielded by this iterator include their terminators.
    // Only compiled for tests; production callers use `bytes` directly.
    #[cfg(test)]
    pub(crate) fn lines(&self) -> LineIter<'b> {
        LineIter::new(self.line_term.as_byte(), self.bytes)
    }

    /// Returns the absolute byte offset of the start of this context. This
    /// offset is absolute in that it is relative to the very beginning of the
    /// input in a search, and can never be relied upon to be a valid index
    /// into an in-memory slice.
    #[inline]
    pub fn absolute_byte_offset(&self) -> u64 {
        self.absolute_byte_offset
    }

    /// Returns the line number of the first line in this context, if
    /// available.
    ///
    /// Line numbers are only available when the search builder is instructed
    /// to compute them.
    #[inline]
    pub fn line_number(&self) -> Option<u64> {
        self.line_number
    }
}
|
||||||
|
|
||||||
|
/// A collection of convenience implementations of `Sink`.
|
||||||
|
///
|
||||||
|
/// Each implementation in this module makes some kind of sacrifice in the name
|
||||||
|
/// of making common cases easier to use. Most frequently, each type is a
|
||||||
|
/// wrapper around a closure specified by the caller that provides limited
|
||||||
|
/// access to the full suite of information available to implementors of
|
||||||
|
/// `Sink`.
|
||||||
|
///
|
||||||
|
/// For example, the `UTF8` sink makes the following sacrifices:
|
||||||
|
///
|
||||||
|
/// * All matches must be UTF-8. An arbitrary `Sink` does not have this
|
||||||
|
/// restriction and can deal with arbitrary data. If this sink sees invalid
|
||||||
|
/// UTF-8, then an error is returned and searching stops. (Use the `Lossy`
|
||||||
|
/// sink instead to suppress this error.)
|
||||||
|
/// * The searcher must be configured to report line numbers. If it isn't,
|
||||||
|
/// an error is reported at the first match and searching stops.
|
||||||
|
/// * Context lines, context breaks and summary data reported at the end of
|
||||||
|
/// a search are all ignored.
|
||||||
|
/// * Implementors are forced to use `io::Error` as their error type.
|
||||||
|
///
|
||||||
|
/// If you need more flexibility, then you're advised to implement the `Sink`
|
||||||
|
/// trait directly.
|
||||||
|
pub mod sinks {
|
||||||
|
use std::io;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use searcher::Searcher;
|
||||||
|
use super::{Sink, SinkError, SinkMatch};
|
||||||
|
|
||||||
|
/// A sink that provides line numbers and matches as strings while ignoring
|
||||||
|
/// everything else.
|
||||||
|
///
|
||||||
|
/// This implementation will return an error if a match contains invalid
|
||||||
|
/// UTF-8 or if the searcher was not configured to count lines. Errors
|
||||||
|
/// on invalid UTF-8 can be suppressed by using the `Lossy` sink instead
|
||||||
|
/// of this one.
|
||||||
|
///
|
||||||
|
/// The closure accepts two parameters: a line number and a UTF-8 string
|
||||||
|
/// containing the matched data. The closure returns a
|
||||||
|
/// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
|
||||||
|
/// stops immediately. Otherwise, searching continues.
|
||||||
|
///
|
||||||
|
/// If multi line mode was enabled, the line number refers to the line
|
||||||
|
/// number of the first line in the match.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct UTF8<F>(pub F)
|
||||||
|
where F: FnMut(u64, &str) -> Result<bool, io::Error>;
|
||||||
|
|
||||||
|
impl<F> Sink for UTF8<F>
|
||||||
|
where F: FnMut(u64, &str) -> Result<bool, io::Error>
|
||||||
|
{
|
||||||
|
type Error = io::Error;
|
||||||
|
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
let matched = match str::from_utf8(mat.bytes()) {
|
||||||
|
Ok(matched) => matched,
|
||||||
|
Err(err) => return Err(io::Error::error_message(err)),
|
||||||
|
};
|
||||||
|
let line_number = match mat.line_number() {
|
||||||
|
Some(line_number) => line_number,
|
||||||
|
None => {
|
||||||
|
let msg = "line numbers not enabled";
|
||||||
|
return Err(io::Error::error_message(msg));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
(self.0)(line_number, &matched)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A sink that provides line numbers and matches as (lossily converted)
|
||||||
|
/// strings while ignoring everything else.
|
||||||
|
///
|
||||||
|
/// This is like `UTF8`, except that if a match contains invalid UTF-8,
|
||||||
|
/// then it will be lossily converted to valid UTF-8 by substituting
|
||||||
|
/// invalid UTF-8 with Unicode replacement characters.
|
||||||
|
///
|
||||||
|
/// This implementation will return an error on the first match if the
|
||||||
|
/// searcher was not configured to count lines.
|
||||||
|
///
|
||||||
|
/// The closure accepts two parameters: a line number and a UTF-8 string
|
||||||
|
/// containing the matched data. The closure returns a
|
||||||
|
/// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
|
||||||
|
/// stops immediately. Otherwise, searching continues.
|
||||||
|
///
|
||||||
|
/// If multi line mode was enabled, the line number refers to the line
|
||||||
|
/// number of the first line in the match.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Lossy<F>(pub F)
|
||||||
|
where F: FnMut(u64, &str) -> Result<bool, io::Error>;
|
||||||
|
|
||||||
|
impl<F> Sink for Lossy<F>
|
||||||
|
where F: FnMut(u64, &str) -> Result<bool, io::Error>
|
||||||
|
{
|
||||||
|
type Error = io::Error;
|
||||||
|
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
let matched = match str::from_utf8(mat.bytes()) {
|
||||||
|
Ok(matched) => Cow::Borrowed(matched),
|
||||||
|
// TODO: In theory, it should be possible to amortize
|
||||||
|
// allocation here, but `std` doesn't provide such an API.
|
||||||
|
// Regardless, this only happens on matches with invalid UTF-8,
|
||||||
|
// which should be pretty rare.
|
||||||
|
Err(_) => String::from_utf8_lossy(mat.bytes()),
|
||||||
|
};
|
||||||
|
let line_number = match mat.line_number() {
|
||||||
|
Some(line_number) => line_number,
|
||||||
|
None => {
|
||||||
|
let msg = "line numbers not enabled";
|
||||||
|
return Err(io::Error::error_message(msg));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
(self.0)(line_number, &matched)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A sink that provides line numbers and matches as raw bytes while
|
||||||
|
/// ignoring everything else.
|
||||||
|
///
|
||||||
|
/// This implementation will return an error on the first match if the
|
||||||
|
/// searcher was not configured to count lines.
|
||||||
|
///
|
||||||
|
/// The closure accepts two parameters: a line number and a raw byte string
|
||||||
|
/// containing the matched data. The closure returns a `Result<bool,
|
||||||
|
/// io::Error>`. If the `bool` is `false`, then the search stops
|
||||||
|
/// immediately. Otherwise, searching continues.
|
||||||
|
///
|
||||||
|
/// If multi line mode was enabled, the line number refers to the line
|
||||||
|
/// number of the first line in the match.
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct Bytes<F>(pub F)
|
||||||
|
where F: FnMut(u64, &[u8]) -> Result<bool, io::Error>;
|
||||||
|
|
||||||
|
impl<F> Sink for Bytes<F>
|
||||||
|
where F: FnMut(u64, &[u8]) -> Result<bool, io::Error>
|
||||||
|
{
|
||||||
|
type Error = io::Error;
|
||||||
|
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
let line_number = match mat.line_number() {
|
||||||
|
Some(line_number) => line_number,
|
||||||
|
None => {
|
||||||
|
let msg = "line numbers not enabled";
|
||||||
|
return Err(io::Error::error_message(msg));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
(self.0)(line_number, mat.bytes())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
787
grep-searcher/src/testutil.rs
Normal file
787
grep-searcher/src/testutil.rs
Normal file
@@ -0,0 +1,787 @@
|
|||||||
|
use std::io::{self, Write};
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use grep_matcher::{
|
||||||
|
LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
|
||||||
|
};
|
||||||
|
use memchr::memchr;
|
||||||
|
use regex::bytes::{Regex, RegexBuilder};
|
||||||
|
|
||||||
|
use searcher::{BinaryDetection, Searcher, SearcherBuilder};
|
||||||
|
use sink::{Sink, SinkContext, SinkFinish, SinkMatch};
|
||||||
|
|
||||||
|
/// A simple regex matcher.
///
/// This supports setting the matcher's line terminator configuration directly,
/// which we use for testing purposes. That is, the caller explicitly
/// determines whether the line terminator optimization is enabled. (In reality
/// this optimization is detected automatically by inspecting and possibly
/// modifying the regex itself.)
#[derive(Clone, Debug)]
pub struct RegexMatcher {
    // The compiled pattern (multi-line mode enabled; see `new`).
    regex: Regex,
    // The line terminator reported by `line_terminator`, if any. None by
    // default.
    line_term: Option<LineTerminator>,
    // When true, `find_candidate_line` reports every line as a candidate,
    // forcing the searcher to cope with false positives.
    every_line_is_candidate: bool,
}
|
||||||
|
|
||||||
|
impl RegexMatcher {
|
||||||
|
/// Create a new regex matcher.
|
||||||
|
pub fn new(pattern: &str) -> RegexMatcher {
|
||||||
|
let regex = RegexBuilder::new(pattern)
|
||||||
|
.multi_line(true) // permits ^ and $ to match at \n boundaries
|
||||||
|
.build()
|
||||||
|
.unwrap();
|
||||||
|
RegexMatcher {
|
||||||
|
regex: regex,
|
||||||
|
line_term: None,
|
||||||
|
every_line_is_candidate: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Forcefully set the line terminator of this matcher.
|
||||||
|
///
|
||||||
|
/// By default, this matcher has no line terminator set.
|
||||||
|
pub fn set_line_term(
|
||||||
|
&mut self,
|
||||||
|
line_term: Option<LineTerminator>,
|
||||||
|
) -> &mut RegexMatcher {
|
||||||
|
self.line_term = line_term;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to return every line as a candidate or not.
|
||||||
|
///
|
||||||
|
/// This forces searchers to handle the case of reporting a false positive.
|
||||||
|
pub fn every_line_is_candidate(
|
||||||
|
&mut self,
|
||||||
|
yes: bool,
|
||||||
|
) -> &mut RegexMatcher {
|
||||||
|
self.every_line_is_candidate = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Matcher for RegexMatcher {
|
||||||
|
type Captures = NoCaptures;
|
||||||
|
type Error = NoError;
|
||||||
|
|
||||||
|
fn find_at(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
at: usize,
|
||||||
|
) -> Result<Option<Match>, NoError> {
|
||||||
|
Ok(self.regex
|
||||||
|
.find_at(haystack, at)
|
||||||
|
.map(|m| Match::new(m.start(), m.end())))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new_captures(&self) -> Result<NoCaptures, NoError> {
|
||||||
|
Ok(NoCaptures::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn line_terminator(&self) -> Option<LineTerminator> {
|
||||||
|
self.line_term
|
||||||
|
}
|
||||||
|
|
||||||
|
fn find_candidate_line(
|
||||||
|
&self,
|
||||||
|
haystack: &[u8],
|
||||||
|
) -> Result<Option<LineMatchKind>, NoError> {
|
||||||
|
if self.every_line_is_candidate {
|
||||||
|
assert!(self.line_term.is_some());
|
||||||
|
if haystack.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
// Make it interesting and return the last byte in the current
|
||||||
|
// line.
|
||||||
|
let i = memchr(self.line_term.unwrap().as_byte(), haystack)
|
||||||
|
.map(|i| i)
|
||||||
|
.unwrap_or(haystack.len() - 1);
|
||||||
|
Ok(Some(LineMatchKind::Candidate(i)))
|
||||||
|
} else {
|
||||||
|
Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// An implementation of Sink that prints all available information.
///
/// This is useful for tests because it lets us easily confirm whether data
/// is being passed to Sink correctly.
#[derive(Clone, Debug)]
// The inner Vec accumulates the formatted output of every callback; it is
// exposed via `as_bytes`.
pub struct KitchenSink(Vec<u8>);
|
||||||
|
|
||||||
|
impl KitchenSink {
|
||||||
|
/// Create a new implementation of Sink that includes everything in the
|
||||||
|
/// kitchen.
|
||||||
|
pub fn new() -> KitchenSink {
|
||||||
|
KitchenSink(vec![])
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the data written to this sink.
|
||||||
|
pub fn as_bytes(&self) -> &[u8] {
|
||||||
|
&self.0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Sink for KitchenSink {
|
||||||
|
type Error = io::Error;
|
||||||
|
|
||||||
|
fn matched(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
mat: &SinkMatch,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
assert!(!mat.bytes().is_empty());
|
||||||
|
assert!(mat.lines().count() >= 1);
|
||||||
|
|
||||||
|
let mut line_number = mat.line_number();
|
||||||
|
let mut byte_offset = mat.absolute_byte_offset();
|
||||||
|
for line in mat.lines() {
|
||||||
|
if let Some(ref mut n) = line_number {
|
||||||
|
write!(self.0, "{}:", n)?;
|
||||||
|
*n += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
write!(self.0, "{}:", byte_offset)?;
|
||||||
|
byte_offset += line.len() as u64;
|
||||||
|
self.0.write_all(line)?;
|
||||||
|
}
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn context(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
context: &SinkContext,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
assert!(!context.bytes().is_empty());
|
||||||
|
assert!(context.lines().count() == 1);
|
||||||
|
|
||||||
|
if let Some(line_number) = context.line_number() {
|
||||||
|
write!(self.0, "{}-", line_number)?;
|
||||||
|
}
|
||||||
|
write!(self.0, "{}-", context.absolute_byte_offset)?;
|
||||||
|
self.0.write_all(context.bytes())?;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn context_break(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
) -> Result<bool, io::Error> {
|
||||||
|
self.0.write_all(b"--\n")?;
|
||||||
|
Ok(true)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn finish(
|
||||||
|
&mut self,
|
||||||
|
_searcher: &Searcher,
|
||||||
|
sink_finish: &SinkFinish,
|
||||||
|
) -> Result<(), io::Error> {
|
||||||
|
writeln!(self.0, "")?;
|
||||||
|
writeln!(self.0, "byte count:{}", sink_finish.byte_count())?;
|
||||||
|
if let Some(offset) = sink_finish.binary_byte_offset() {
|
||||||
|
writeln!(self.0, "binary offset:{}", offset)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A type for expressing tests on a searcher.
///
/// The searcher code has a lot of different code paths, mostly for the
/// purposes of optimizing a bunch of different use cases. The intent of the
/// searcher is to pick the best code path based on the configuration, which
/// means there is no obviously direct way to ask that a specific code path
/// be exercised. Thus, the purpose of this tester is to explicitly check as
/// many code paths that make sense.
///
/// The tester works by assuming you want to test all pertinent code paths.
/// These can be trimmed down as necessary via the various builder methods.
#[derive(Debug)]
pub struct SearcherTester {
    // The input to search over.
    haystack: String,
    // The pattern to search for.
    pattern: String,
    // When set, only test configurations whose label matches this regex
    // are executed.
    filter: Option<::regex::Regex>,
    // When true, print every configuration label before running.
    print_labels: bool,
    // Expected output without line numbers (required).
    expected_no_line_number: Option<String>,
    // Expected output with line numbers (required when `line_number` is on).
    expected_with_line_number: Option<String>,
    // Slice-search overrides for the two expectations above; when absent,
    // the reader expectations are used.
    expected_slice_no_line_number: Option<String>,
    expected_slice_with_line_number: Option<String>,
    // Which searcher strategies and options to exercise.
    by_line: bool,
    multi_line: bool,
    invert_match: bool,
    line_number: bool,
    binary: BinaryDetection,
    auto_heap_limit: bool,
    after_context: usize,
    before_context: usize,
    passthru: bool,
}
|
||||||
|
|
||||||
|
impl SearcherTester {
|
||||||
|
/// Create a new tester for testing searchers.
///
/// The defaults exercise all pertinent code paths: both line-by-line and
/// multi-line strategies, with line numbers, with no context, no binary
/// detection and with the automatic heap-limit test enabled.
pub fn new(haystack: &str, pattern: &str) -> SearcherTester {
    SearcherTester {
        haystack: haystack.to_string(),
        pattern: pattern.to_string(),
        filter: None,
        print_labels: false,
        expected_no_line_number: None,
        expected_with_line_number: None,
        expected_slice_no_line_number: None,
        expected_slice_with_line_number: None,
        by_line: true,
        multi_line: true,
        invert_match: false,
        line_number: true,
        binary: BinaryDetection::none(),
        auto_heap_limit: true,
        after_context: 0,
        before_context: 0,
        passthru: false,
    }
}
|
||||||
|
|
||||||
|
/// Execute the test. If the test succeeds, then this returns successfully.
|
||||||
|
/// If the test fails, then it panics with an informative message.
|
||||||
|
pub fn test(&self) {
|
||||||
|
// Check for configuration errors.
|
||||||
|
if self.expected_no_line_number.is_none() {
|
||||||
|
panic!("an 'expected' string with NO line numbers must be given");
|
||||||
|
}
|
||||||
|
if self.line_number && self.expected_with_line_number.is_none() {
|
||||||
|
panic!("an 'expected' string with line numbers must be given, \
|
||||||
|
or disable testing with line numbers");
|
||||||
|
}
|
||||||
|
|
||||||
|
let configs = self.configs();
|
||||||
|
if configs.is_empty() {
|
||||||
|
panic!("test configuration resulted in nothing being tested");
|
||||||
|
}
|
||||||
|
if self.print_labels {
|
||||||
|
for config in &configs {
|
||||||
|
let labels = vec![
|
||||||
|
format!("reader-{}", config.label),
|
||||||
|
format!("slice-{}", config.label),
|
||||||
|
];
|
||||||
|
for label in &labels {
|
||||||
|
if self.include(label) {
|
||||||
|
println!("{}", label);
|
||||||
|
} else {
|
||||||
|
println!("{} (ignored)", label);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for config in &configs {
|
||||||
|
let label = format!("reader-{}", config.label);
|
||||||
|
if self.include(&label) {
|
||||||
|
let got = config.search_reader(&self.haystack);
|
||||||
|
assert_eq_printed!(config.expected_reader, got, "{}", label);
|
||||||
|
}
|
||||||
|
|
||||||
|
let label = format!("slice-{}", config.label);
|
||||||
|
if self.include(&label) {
|
||||||
|
let got = config.search_slice(&self.haystack);
|
||||||
|
assert_eq_printed!(config.expected_slice, got, "{}", label);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set a regex pattern to filter the tests that are run.
///
/// By default, no filter is present. When a filter is set, only test
/// configurations with a label matching the given pattern will be run.
///
/// This is often useful when debugging tests, e.g., when you want to do
/// printf debugging and only want one particular test configuration to
/// execute.
///
/// Panics if `pattern` is not a valid regex. Returns `self` to permit
/// chaining.
#[allow(dead_code)]
pub fn filter(&mut self, pattern: &str) -> &mut SearcherTester {
    self.filter = Some(::regex::Regex::new(pattern).unwrap());
    self
}
|
||||||
|
|
||||||
|
/// When set, the labels for all test configurations are printed before
/// executing any test.
///
/// Note that in order to see these in tests that aren't failing, you'll
/// want to use `cargo test -- --nocapture`.
///
/// Returns `self` to permit chaining.
#[allow(dead_code)]
pub fn print_labels(&mut self, yes: bool) -> &mut SearcherTester {
    self.print_labels = yes;
    self
}
|
||||||
|
|
||||||
|
/// Set the expected search results, without line numbers.
///
/// This expectation is mandatory; `test` panics if it was never set.
/// Returns `self` to permit chaining.
pub fn expected_no_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_no_line_number = Some(exp.to_string());
    self
}
|
||||||
|
|
||||||
|
/// Set the expected search results, with line numbers.
///
/// Required whenever line-number testing is enabled (the default).
/// Returns `self` to permit chaining.
pub fn expected_with_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_with_line_number = Some(exp.to_string());
    self
}
|
||||||
|
|
||||||
|
/// Set the expected search results, without line numbers, when performing
/// a search on a slice. When not present, `expected_no_line_number` is
/// used instead.
///
/// Returns `self` to permit chaining.
pub fn expected_slice_no_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_slice_no_line_number = Some(exp.to_string());
    self
}
|
||||||
|
|
||||||
|
/// Set the expected search results, with line numbers, when performing a
/// search on a slice. When not present, `expected_with_line_number` is
/// used instead.
///
/// Returns `self` to permit chaining.
#[allow(dead_code)]
pub fn expected_slice_with_line_number(
    &mut self,
    exp: &str,
) -> &mut SearcherTester {
    self.expected_slice_with_line_number = Some(exp.to_string());
    self
}
|
||||||
|
|
||||||
|
/// Whether to test search with line numbers or not.
///
/// This is enabled by default. When enabled, the string that is expected
/// when line numbers are present must be provided. Otherwise, the expected
/// string isn't required.
///
/// Returns `self` to permit chaining.
pub fn line_number(&mut self, yes: bool) -> &mut SearcherTester {
    self.line_number = yes;
    self
}
|
||||||
|
|
||||||
|
/// Whether to test search using the line-by-line searcher or not.
///
/// By default, this is enabled. Returns `self` to permit chaining.
pub fn by_line(&mut self, yes: bool) -> &mut SearcherTester {
    self.by_line = yes;
    self
}
|
||||||
|
|
||||||
|
/// Whether to test search using the multi line searcher or not.
///
/// By default, this is enabled. Returns `self` to permit chaining.
#[allow(dead_code)]
pub fn multi_line(&mut self, yes: bool) -> &mut SearcherTester {
    self.multi_line = yes;
    self
}
|
||||||
|
|
||||||
|
/// Whether to perform an inverted search or not.
///
/// By default, this is disabled. Returns `self` to permit chaining.
pub fn invert_match(&mut self, yes: bool) -> &mut SearcherTester {
    self.invert_match = yes;
    self
}
|
||||||
|
|
||||||
|
/// Whether to enable binary detection on all searches.
///
/// By default, this is disabled (`BinaryDetection::none()`). Returns
/// `self` to permit chaining.
pub fn binary_detection(
    &mut self,
    detection: BinaryDetection,
) -> &mut SearcherTester {
    self.binary = detection;
    self
}
|
||||||
|
|
||||||
|
/// Whether to automatically attempt to test the heap limit setting or not.
///
/// By default, one of the test configurations includes setting the heap
/// limit to its minimal value for normal operation, which checks that
/// everything works even at the extremes. However, in some cases, the heap
/// limit can (expectedly) alter the output slightly. For example, it can
/// impact the number of bytes searched when performing binary detection.
/// For convenience, it can be useful to disable the automatic heap limit
/// test.
///
/// Returns `self` to permit chaining.
pub fn auto_heap_limit(&mut self, yes: bool) -> &mut SearcherTester {
    self.auto_heap_limit = yes;
    self
}
|
||||||
|
|
||||||
|
/// Set the number of lines to include in the "after" context.
///
/// The default is `0`, which is equivalent to not printing any context.
/// Returns `self` to permit chaining.
pub fn after_context(&mut self, lines: usize) -> &mut SearcherTester {
    self.after_context = lines;
    self
}
|
||||||
|
|
||||||
|
/// Set the number of lines to include in the "before" context.
|
||||||
|
///
|
||||||
|
/// The default is `0`, which is equivalent to not printing any context.
|
||||||
|
pub fn before_context(&mut self, lines: usize) -> &mut SearcherTester {
|
||||||
|
self.before_context = lines;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether to enable the "passthru" feature or not.
|
||||||
|
///
|
||||||
|
/// When passthru is enabled, it effectively treats all non-matching lines
|
||||||
|
/// as contextual lines. In other words, enabling this is akin to
|
||||||
|
/// requesting an unbounded number of before and after contextual lines.
|
||||||
|
///
|
||||||
|
/// This is disabled by default.
|
||||||
|
pub fn passthru(&mut self, yes: bool) -> &mut SearcherTester {
|
||||||
|
self.passthru = yes;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the minimum size of a buffer required for a successful search.
|
||||||
|
///
|
||||||
|
/// Generally, this corresponds to the maximum length of a line (including
|
||||||
|
/// its terminator), but if context settings are enabled, then this must
|
||||||
|
/// include the sum of the longest N lines.
|
||||||
|
///
|
||||||
|
/// Note that this must account for whether the test is using multi line
|
||||||
|
/// search or not, since multi line search requires being able to fit the
|
||||||
|
/// entire haystack into memory.
|
||||||
|
fn minimal_heap_limit(&self, multi_line: bool) -> usize {
|
||||||
|
if multi_line {
|
||||||
|
1 + self.haystack.len()
|
||||||
|
} else if self.before_context == 0 && self.after_context == 0 {
|
||||||
|
1 + self.haystack.lines().map(|s| s.len()).max().unwrap_or(0)
|
||||||
|
} else {
|
||||||
|
let mut lens: Vec<usize> =
|
||||||
|
self.haystack.lines().map(|s| s.len()).collect();
|
||||||
|
lens.sort();
|
||||||
|
lens.reverse();
|
||||||
|
|
||||||
|
let context_count =
|
||||||
|
if self.passthru {
|
||||||
|
self.haystack.lines().count()
|
||||||
|
} else {
|
||||||
|
// Why do we add 2 here? Well, we need to add 1 in order to
|
||||||
|
// have room to search at least one line. We add another
|
||||||
|
// because the implementation will occasionally include
|
||||||
|
// an additional line when handling the context. There's
|
||||||
|
// no particularly good reason, other than keeping the
|
||||||
|
// implementation simple.
|
||||||
|
2 + self.before_context + self.after_context
|
||||||
|
};
|
||||||
|
|
||||||
|
// We add 1 to each line since `str::lines` doesn't include the
|
||||||
|
// line terminator.
|
||||||
|
lens.into_iter()
|
||||||
|
.take(context_count)
|
||||||
|
.map(|len| len + 1)
|
||||||
|
.sum::<usize>()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns true if and only if the given label should be included as part
|
||||||
|
/// of executing `test`.
|
||||||
|
///
|
||||||
|
/// Inclusion is determined by the filter specified. If no filter has been
|
||||||
|
/// given, then this always returns `true`.
|
||||||
|
fn include(&self, label: &str) -> bool {
|
||||||
|
let re = match self.filter {
|
||||||
|
None => return true,
|
||||||
|
Some(ref re) => re,
|
||||||
|
};
|
||||||
|
re.is_match(label)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configs generates a set of all search configurations that should be
|
||||||
|
/// tested. The configs generated are based on the configuration in this
|
||||||
|
/// builder.
|
||||||
|
fn configs(&self) -> Vec<TesterConfig> {
|
||||||
|
let mut configs = vec![];
|
||||||
|
|
||||||
|
let matcher = RegexMatcher::new(&self.pattern);
|
||||||
|
let mut builder = SearcherBuilder::new();
|
||||||
|
builder
|
||||||
|
.line_number(false)
|
||||||
|
.invert_match(self.invert_match)
|
||||||
|
.binary_detection(self.binary.clone())
|
||||||
|
.after_context(self.after_context)
|
||||||
|
.before_context(self.before_context)
|
||||||
|
.passthru(self.passthru);
|
||||||
|
|
||||||
|
if self.by_line {
|
||||||
|
let mut matcher = matcher.clone();
|
||||||
|
let mut builder = builder.clone();
|
||||||
|
|
||||||
|
let expected_reader =
|
||||||
|
self.expected_no_line_number.as_ref().unwrap().to_string();
|
||||||
|
let expected_slice = match self.expected_slice_no_line_number {
|
||||||
|
None => expected_reader.clone(),
|
||||||
|
Some(ref e) => e.to_string(),
|
||||||
|
};
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-noterm-nonumber".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
if self.auto_heap_limit {
|
||||||
|
builder.heap_limit(Some(self.minimal_heap_limit(false)));
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-noterm-nonumber-heaplimit".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
builder.heap_limit(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
matcher.set_line_term(Some(LineTerminator::byte(b'\n')));
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-term-nonumber".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
matcher.every_line_is_candidate(true);
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-term-nonumber-candidates".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if self.by_line && self.line_number {
|
||||||
|
let mut matcher = matcher.clone();
|
||||||
|
let mut builder = builder.clone();
|
||||||
|
|
||||||
|
let expected_reader =
|
||||||
|
self.expected_with_line_number.as_ref().unwrap().to_string();
|
||||||
|
let expected_slice = match self.expected_slice_with_line_number {
|
||||||
|
None => expected_reader.clone(),
|
||||||
|
Some(ref e) => e.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.line_number(true);
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-noterm-number".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
matcher.set_line_term(Some(LineTerminator::byte(b'\n')));
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-term-number".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
matcher.every_line_is_candidate(true);
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "byline-term-number-candidates".to_string(),
|
||||||
|
expected_reader: expected_reader.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if self.multi_line {
|
||||||
|
let mut builder = builder.clone();
|
||||||
|
let expected_slice = match self.expected_slice_no_line_number {
|
||||||
|
None => {
|
||||||
|
self.expected_no_line_number.as_ref().unwrap().to_string()
|
||||||
|
}
|
||||||
|
Some(ref e) => e.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.multi_line(true);
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "multiline-nonumber".to_string(),
|
||||||
|
expected_reader: expected_slice.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
if self.auto_heap_limit {
|
||||||
|
builder.heap_limit(Some(self.minimal_heap_limit(true)));
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "multiline-nonumber-heaplimit".to_string(),
|
||||||
|
expected_reader: expected_slice.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
builder.heap_limit(None);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if self.multi_line && self.line_number {
|
||||||
|
let mut builder = builder.clone();
|
||||||
|
let expected_slice = match self.expected_slice_with_line_number {
|
||||||
|
None => {
|
||||||
|
self.expected_with_line_number
|
||||||
|
.as_ref().unwrap().to_string()
|
||||||
|
}
|
||||||
|
Some(ref e) => e.to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
builder.multi_line(true);
|
||||||
|
builder.line_number(true);
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "multiline-number".to_string(),
|
||||||
|
expected_reader: expected_slice.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
builder.heap_limit(Some(self.minimal_heap_limit(true)));
|
||||||
|
configs.push(TesterConfig {
|
||||||
|
label: "multiline-number-heaplimit".to_string(),
|
||||||
|
expected_reader: expected_slice.clone(),
|
||||||
|
expected_slice: expected_slice.clone(),
|
||||||
|
builder: builder.clone(),
|
||||||
|
matcher: matcher.clone(),
|
||||||
|
});
|
||||||
|
builder.heap_limit(None);
|
||||||
|
}
|
||||||
|
configs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
struct TesterConfig {
|
||||||
|
label: String,
|
||||||
|
expected_reader: String,
|
||||||
|
expected_slice: String,
|
||||||
|
builder: SearcherBuilder,
|
||||||
|
matcher: RegexMatcher,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TesterConfig {
|
||||||
|
/// Execute a search using a reader. This exercises the incremental search
|
||||||
|
/// strategy, where the entire contents of the corpus aren't necessarily
|
||||||
|
/// in memory at once.
|
||||||
|
fn search_reader(&self, haystack: &str) -> String {
|
||||||
|
let mut sink = KitchenSink::new();
|
||||||
|
let mut searcher = self.builder.build();
|
||||||
|
let result = searcher.search_reader(
|
||||||
|
&self.matcher,
|
||||||
|
haystack.as_bytes(),
|
||||||
|
&mut sink,
|
||||||
|
);
|
||||||
|
if let Err(err) = result {
|
||||||
|
let label = format!("reader-{}", self.label);
|
||||||
|
panic!("error running '{}': {}", label, err);
|
||||||
|
}
|
||||||
|
String::from_utf8(sink.as_bytes().to_vec()).unwrap()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Execute a search using a slice. This exercises the search routines that
|
||||||
|
/// have the entire contents of the corpus in memory at one time.
|
||||||
|
fn search_slice(&self, haystack: &str) -> String {
|
||||||
|
let mut sink = KitchenSink::new();
|
||||||
|
let mut searcher = self.builder.build();
|
||||||
|
let result = searcher.search_slice(
|
||||||
|
&self.matcher,
|
||||||
|
haystack.as_bytes(),
|
||||||
|
&mut sink,
|
||||||
|
);
|
||||||
|
if let Err(err) = result {
|
||||||
|
let label = format!("slice-{}", self.label);
|
||||||
|
panic!("error running '{}': {}", label, err);
|
||||||
|
}
|
||||||
|
String::from_utf8(sink.as_bytes().to_vec()).unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use grep_matcher::{Match, Matcher};
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn m(start: usize, end: usize) -> Match {
|
||||||
|
Match::new(start, end)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line1() {
|
||||||
|
let haystack = b"";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line2() {
|
||||||
|
let haystack = b"\n";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line3() {
|
||||||
|
let haystack = b"\n\n";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line4() {
|
||||||
|
let haystack = b"a\n\nb\n";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 3), Ok(Some(m(5, 5))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 4), Ok(Some(m(5, 5))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 5), Ok(Some(m(5, 5))));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line5() {
|
||||||
|
let haystack = b"a\n\nb\nc";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 3), Ok(None));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 4), Ok(None));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 5), Ok(None));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 6), Ok(None));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn empty_line6() {
|
||||||
|
let haystack = b"a\n";
|
||||||
|
let matcher = RegexMatcher::new(r"^$");
|
||||||
|
|
||||||
|
assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2))));
|
||||||
|
assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2))));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "grep"
|
name = "grep"
|
||||||
version = "0.1.9" #:version
|
version = "0.2.3" #:version
|
||||||
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
authors = ["Andrew Gallant <jamslam@gmail.com>"]
|
||||||
description = """
|
description = """
|
||||||
Fast line oriented regex searching as a library.
|
Fast line oriented regex searching as a library.
|
||||||
@@ -13,7 +13,20 @@ keywords = ["regex", "grep", "egrep", "search", "pattern"]
|
|||||||
license = "Unlicense/MIT"
|
license = "Unlicense/MIT"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
log = "0.4"
|
grep-cli = { version = "0.1.1", path = "../grep-cli" }
|
||||||
memchr = "2"
|
grep-matcher = { version = "0.1.1", path = "../grep-matcher" }
|
||||||
regex = "1"
|
grep-pcre2 = { version = "0.1.2", path = "../grep-pcre2", optional = true }
|
||||||
regex-syntax = "0.6"
|
grep-printer = { version = "0.1.1", path = "../grep-printer" }
|
||||||
|
grep-regex = { version = "0.1.1", path = "../grep-regex" }
|
||||||
|
grep-searcher = { version = "0.1.1", path = "../grep-searcher" }
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
termcolor = "1.0.4"
|
||||||
|
walkdir = "2.2.7"
|
||||||
|
|
||||||
|
[features]
|
||||||
|
simd-accel = ["grep-searcher/simd-accel"]
|
||||||
|
pcre2 = ["grep-pcre2"]
|
||||||
|
|
||||||
|
# This feature is DEPRECATED. Runtime dispatch is used for SIMD now.
|
||||||
|
avx-accel = []
|
||||||
|
|||||||
@@ -1,4 +1,41 @@
|
|||||||
grep
|
grep
|
||||||
----
|
----
|
||||||
This is a *library* that provides grep-style line-by-line regex searching (with
|
ripgrep, as a library.
|
||||||
comparable performance to `grep` itself).
|
|
||||||
|
[](https://travis-ci.org/BurntSushi/ripgrep)
|
||||||
|
[](https://ci.appveyor.com/project/BurntSushi/ripgrep)
|
||||||
|
[](https://crates.io/crates/grep)
|
||||||
|
|
||||||
|
Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
|
||||||
|
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
[https://docs.rs/grep](https://docs.rs/grep)
|
||||||
|
|
||||||
|
NOTE: This crate isn't ready for wide use yet. Ambitious individuals can
|
||||||
|
probably piece together the parts, but there is no high level documentation
|
||||||
|
describing how all of the pieces fit together.
|
||||||
|
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
Add this to your `Cargo.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[dependencies]
|
||||||
|
grep = "0.2"
|
||||||
|
```
|
||||||
|
|
||||||
|
and this to your crate root:
|
||||||
|
|
||||||
|
```rust
|
||||||
|
extern crate grep;
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
This crate provides a `pcre2` feature (disabled by default) which, when
|
||||||
|
enabled, re-exports the `grep-pcre2` crate as an alternative `Matcher`
|
||||||
|
implementation to the standard `grep-regex` implementation.
|
||||||
|
|||||||
74
grep/examples/simplegrep.rs
Normal file
74
grep/examples/simplegrep.rs
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
extern crate grep;
|
||||||
|
extern crate termcolor;
|
||||||
|
extern crate walkdir;
|
||||||
|
|
||||||
|
use std::env;
|
||||||
|
use std::error::Error;
|
||||||
|
use std::ffi::OsString;
|
||||||
|
use std::process;
|
||||||
|
|
||||||
|
use grep::cli;
|
||||||
|
use grep::printer::{ColorSpecs, StandardBuilder};
|
||||||
|
use grep::regex::RegexMatcher;
|
||||||
|
use grep::searcher::{BinaryDetection, SearcherBuilder};
|
||||||
|
use termcolor::ColorChoice;
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
if let Err(err) = try_main() {
|
||||||
|
eprintln!("{}", err);
|
||||||
|
process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn try_main() -> Result<(), Box<Error>> {
|
||||||
|
let mut args: Vec<OsString> = env::args_os().collect();
|
||||||
|
if args.len() < 2 {
|
||||||
|
return Err("Usage: simplegrep <pattern> [<path> ...]".into());
|
||||||
|
}
|
||||||
|
if args.len() == 2 {
|
||||||
|
args.push(OsString::from("./"));
|
||||||
|
}
|
||||||
|
search(cli::pattern_from_os(&args[1])?, &args[2..])
|
||||||
|
}
|
||||||
|
|
||||||
|
fn search(pattern: &str, paths: &[OsString]) -> Result<(), Box<Error>> {
|
||||||
|
let matcher = RegexMatcher::new_line_matcher(&pattern)?;
|
||||||
|
let mut searcher = SearcherBuilder::new()
|
||||||
|
.binary_detection(BinaryDetection::quit(b'\x00'))
|
||||||
|
.line_number(false)
|
||||||
|
.build();
|
||||||
|
let mut printer = StandardBuilder::new()
|
||||||
|
.color_specs(ColorSpecs::default_with_color())
|
||||||
|
.build(cli::stdout(
|
||||||
|
if cli::is_tty_stdout() {
|
||||||
|
ColorChoice::Auto
|
||||||
|
} else {
|
||||||
|
ColorChoice::Never
|
||||||
|
}
|
||||||
|
));
|
||||||
|
|
||||||
|
for path in paths {
|
||||||
|
for result in WalkDir::new(path) {
|
||||||
|
let dent = match result {
|
||||||
|
Ok(dent) => dent,
|
||||||
|
Err(err) => {
|
||||||
|
eprintln!("{}", err);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if !dent.file_type().is_file() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let result = searcher.search_path(
|
||||||
|
&matcher,
|
||||||
|
dent.path(),
|
||||||
|
printer.sink_with_path(&matcher, dent.path()),
|
||||||
|
);
|
||||||
|
if let Err(err) = result {
|
||||||
|
eprintln!("{}: {}", dent.path().display(), err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -1,84 +1,23 @@
|
|||||||
#![deny(missing_docs)]
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
A fast line oriented regex searcher.
|
ripgrep, as a library.
|
||||||
|
|
||||||
|
This library is intended to provide a high level facade to the crates that
|
||||||
|
make up ripgrep's core searching routines. However, there is no high level
|
||||||
|
documentation available yet guiding users on how to fit all of the pieces
|
||||||
|
together.
|
||||||
|
|
||||||
|
Every public API item in the constituent crates is documented, but examples
|
||||||
|
are sparse.
|
||||||
|
|
||||||
|
A cookbook and a guide are planned.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#[macro_use]
|
#![deny(missing_docs)]
|
||||||
extern crate log;
|
|
||||||
extern crate memchr;
|
|
||||||
extern crate regex;
|
|
||||||
extern crate regex_syntax as syntax;
|
|
||||||
|
|
||||||
use std::error;
|
pub extern crate grep_cli as cli;
|
||||||
use std::fmt;
|
pub extern crate grep_matcher as matcher;
|
||||||
use std::result;
|
#[cfg(feature = "pcre2")]
|
||||||
|
pub extern crate grep_pcre2 as pcre2;
|
||||||
pub use search::{Grep, GrepBuilder, Iter, Match};
|
pub extern crate grep_printer as printer;
|
||||||
|
pub extern crate grep_regex as regex;
|
||||||
mod literals;
|
pub extern crate grep_searcher as searcher;
|
||||||
mod nonl;
|
|
||||||
mod search;
|
|
||||||
mod smart_case;
|
|
||||||
mod word_boundary;
|
|
||||||
|
|
||||||
/// Result is a convenient type alias that fixes the type of the error to
|
|
||||||
/// the `Error` type defined in this crate.
|
|
||||||
pub type Result<T> = result::Result<T, Error>;
|
|
||||||
|
|
||||||
/// Error enumerates the list of possible error conditions when building or
|
|
||||||
/// using a `Grep` line searcher.
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub enum Error {
|
|
||||||
/// An error from parsing or compiling a regex.
|
|
||||||
Regex(regex::Error),
|
|
||||||
/// This error occurs when an illegal literal was found in the regex
|
|
||||||
/// pattern. For example, if the line terminator is `\n` and the regex
|
|
||||||
/// pattern is `\w+\n\w+`, then the presence of `\n` will cause this error.
|
|
||||||
LiteralNotAllowed(char),
|
|
||||||
/// An unused enum variant that indicates this enum may be expanded in
|
|
||||||
/// the future and therefore should not be exhaustively matched.
|
|
||||||
#[doc(hidden)]
|
|
||||||
__Nonexhaustive,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl error::Error for Error {
|
|
||||||
fn description(&self) -> &str {
|
|
||||||
match *self {
|
|
||||||
Error::Regex(ref err) => err.description(),
|
|
||||||
Error::LiteralNotAllowed(_) => "use of forbidden literal",
|
|
||||||
Error::__Nonexhaustive => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn cause(&self) -> Option<&error::Error> {
|
|
||||||
match *self {
|
|
||||||
Error::Regex(ref err) => err.cause(),
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Display for Error {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
match *self {
|
|
||||||
Error::Regex(ref err) => err.fmt(f),
|
|
||||||
Error::LiteralNotAllowed(chr) => {
|
|
||||||
write!(f, "Literal {:?} not allowed.", chr)
|
|
||||||
}
|
|
||||||
Error::__Nonexhaustive => unreachable!(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<regex::Error> for Error {
|
|
||||||
fn from(err: regex::Error) -> Error {
|
|
||||||
Error::Regex(err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<syntax::Error> for Error {
|
|
||||||
fn from(err: syntax::Error) -> Error {
|
|
||||||
Error::Regex(regex::Error::Syntax(err.to_string()))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -1,74 +0,0 @@
|
|||||||
use syntax::hir::{self, Hir, HirKind};
|
|
||||||
|
|
||||||
use {Error, Result};
|
|
||||||
|
|
||||||
/// Returns a new expression that is guaranteed to never match the given
|
|
||||||
/// ASCII character.
|
|
||||||
///
|
|
||||||
/// If the expression contains the literal byte, then an error is returned.
|
|
||||||
///
|
|
||||||
/// If `byte` is not an ASCII character (i.e., greater than `0x7F`), then this
|
|
||||||
/// function panics.
|
|
||||||
pub fn remove(expr: Hir, byte: u8) -> Result<Hir> {
|
|
||||||
assert!(byte <= 0x7F);
|
|
||||||
let chr = byte as char;
|
|
||||||
assert!(chr.len_utf8() == 1);
|
|
||||||
|
|
||||||
Ok(match expr.into_kind() {
|
|
||||||
HirKind::Empty => Hir::empty(),
|
|
||||||
HirKind::Literal(hir::Literal::Unicode(c)) => {
|
|
||||||
if c == chr {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
Hir::literal(hir::Literal::Unicode(c))
|
|
||||||
}
|
|
||||||
HirKind::Literal(hir::Literal::Byte(b)) => {
|
|
||||||
if b as char == chr {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
Hir::literal(hir::Literal::Byte(b))
|
|
||||||
}
|
|
||||||
HirKind::Class(hir::Class::Unicode(mut cls)) => {
|
|
||||||
let remove = hir::ClassUnicode::new(Some(
|
|
||||||
hir::ClassUnicodeRange::new(chr, chr),
|
|
||||||
));
|
|
||||||
cls.difference(&remove);
|
|
||||||
if cls.iter().next().is_none() {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
Hir::class(hir::Class::Unicode(cls))
|
|
||||||
}
|
|
||||||
HirKind::Class(hir::Class::Bytes(mut cls)) => {
|
|
||||||
let remove = hir::ClassBytes::new(Some(
|
|
||||||
hir::ClassBytesRange::new(byte, byte),
|
|
||||||
));
|
|
||||||
cls.difference(&remove);
|
|
||||||
if cls.iter().next().is_none() {
|
|
||||||
return Err(Error::LiteralNotAllowed(chr));
|
|
||||||
}
|
|
||||||
Hir::class(hir::Class::Bytes(cls))
|
|
||||||
}
|
|
||||||
HirKind::Anchor(x) => Hir::anchor(x),
|
|
||||||
HirKind::WordBoundary(x) => Hir::word_boundary(x),
|
|
||||||
HirKind::Repetition(mut x) => {
|
|
||||||
x.hir = Box::new(remove(*x.hir, byte)?);
|
|
||||||
Hir::repetition(x)
|
|
||||||
}
|
|
||||||
HirKind::Group(mut x) => {
|
|
||||||
x.hir = Box::new(remove(*x.hir, byte)?);
|
|
||||||
Hir::group(x)
|
|
||||||
}
|
|
||||||
HirKind::Concat(xs) => {
|
|
||||||
let xs = xs.into_iter()
|
|
||||||
.map(|e| remove(e, byte))
|
|
||||||
.collect::<Result<Vec<Hir>>>()?;
|
|
||||||
Hir::concat(xs)
|
|
||||||
}
|
|
||||||
HirKind::Alternation(xs) => {
|
|
||||||
let xs = xs.into_iter()
|
|
||||||
.map(|e| remove(e, byte))
|
|
||||||
.collect::<Result<Vec<Hir>>>()?;
|
|
||||||
Hir::alternation(xs)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
@@ -1,356 +0,0 @@
|
|||||||
use memchr::{memchr, memrchr};
|
|
||||||
use syntax::ParserBuilder;
|
|
||||||
use syntax::hir::Hir;
|
|
||||||
use regex::bytes::{Regex, RegexBuilder};
|
|
||||||
|
|
||||||
use literals::LiteralSets;
|
|
||||||
use nonl;
|
|
||||||
use smart_case::Cased;
|
|
||||||
use word_boundary::strip_unicode_word_boundaries;
|
|
||||||
use Result;
|
|
||||||
|
|
||||||
/// A matched line.
|
|
||||||
#[derive(Clone, Debug, Default, Eq, PartialEq)]
|
|
||||||
pub struct Match {
|
|
||||||
start: usize,
|
|
||||||
end: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Match {
|
|
||||||
/// Create a new empty match value.
|
|
||||||
pub fn new() -> Match {
|
|
||||||
Match::default()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the starting byte offset of the line that matched.
|
|
||||||
#[inline]
|
|
||||||
pub fn start(&self) -> usize {
|
|
||||||
self.start
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the ending byte offset of the line that matched.
|
|
||||||
#[inline]
|
|
||||||
pub fn end(&self) -> usize {
|
|
||||||
self.end
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A fast line oriented regex searcher.
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct Grep {
|
|
||||||
re: Regex,
|
|
||||||
required: Option<Regex>,
|
|
||||||
opts: Options,
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A builder for a grep searcher.
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
pub struct GrepBuilder {
|
|
||||||
pattern: String,
|
|
||||||
opts: Options,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone, Debug)]
|
|
||||||
struct Options {
|
|
||||||
case_insensitive: bool,
|
|
||||||
case_smart: bool,
|
|
||||||
line_terminator: u8,
|
|
||||||
size_limit: usize,
|
|
||||||
dfa_size_limit: usize,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for Options {
|
|
||||||
fn default() -> Options {
|
|
||||||
Options {
|
|
||||||
case_insensitive: false,
|
|
||||||
case_smart: false,
|
|
||||||
line_terminator: b'\n',
|
|
||||||
size_limit: 10 * (1 << 20),
|
|
||||||
dfa_size_limit: 10 * (1 << 20),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl GrepBuilder {
|
|
||||||
/// Create a new builder for line searching.
|
|
||||||
///
|
|
||||||
/// The pattern given should be a regular expression. The precise syntax
|
|
||||||
/// supported is documented on the regex crate.
|
|
||||||
pub fn new(pattern: &str) -> GrepBuilder {
|
|
||||||
GrepBuilder {
|
|
||||||
pattern: pattern.to_string(),
|
|
||||||
opts: Options::default(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the line terminator.
|
|
||||||
///
|
|
||||||
/// The line terminator can be any ASCII character and serves to delineate
|
|
||||||
/// the match boundaries in the text searched.
|
|
||||||
///
|
|
||||||
/// This panics if `ascii_byte` is greater than `0x7F` (i.e., not ASCII).
|
|
||||||
pub fn line_terminator(mut self, ascii_byte: u8) -> GrepBuilder {
|
|
||||||
assert!(ascii_byte <= 0x7F);
|
|
||||||
self.opts.line_terminator = ascii_byte;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the case sensitive flag (`i`) on the regex.
|
|
||||||
pub fn case_insensitive(mut self, yes: bool) -> GrepBuilder {
|
|
||||||
self.opts.case_insensitive = yes;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Whether to enable smart case search or not (disabled by default).
|
|
||||||
///
|
|
||||||
/// Smart case uses case insensitive search if the pattern contains only
|
|
||||||
/// lowercase characters (ignoring any characters which immediately follow
|
|
||||||
/// a '\'). Otherwise, a case sensitive search is used instead.
|
|
||||||
///
|
|
||||||
/// Enabling the case_insensitive flag overrides this.
|
|
||||||
pub fn case_smart(mut self, yes: bool) -> GrepBuilder {
|
|
||||||
self.opts.case_smart = yes;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the approximate size limit of the compiled regular expression.
|
|
||||||
///
|
|
||||||
/// This roughly corresponds to the number of bytes occupied by a
|
|
||||||
/// single compiled program. If the program exceeds this number, then a
|
|
||||||
/// compilation error is returned.
|
|
||||||
pub fn size_limit(mut self, limit: usize) -> GrepBuilder {
|
|
||||||
self.opts.size_limit = limit;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the approximate size of the cache used by the DFA.
|
|
||||||
///
|
|
||||||
/// This roughly corresponds to the number of bytes that the DFA will use
|
|
||||||
/// while searching.
|
|
||||||
///
|
|
||||||
/// Note that this is a per thread limit. There is no way to set a global
|
|
||||||
/// limit. In particular, if a regex is used from multiple threads
|
|
||||||
/// simulanteously, then each thread may use up to the number of bytes
|
|
||||||
/// specified here.
|
|
||||||
pub fn dfa_size_limit(mut self, limit: usize) -> GrepBuilder {
|
|
||||||
self.opts.dfa_size_limit = limit;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a line searcher.
|
|
||||||
///
|
|
||||||
/// If there was a problem parsing or compiling the regex with the given
|
|
||||||
/// options, then an error is returned.
|
|
||||||
pub fn build(self) -> Result<Grep> {
|
|
||||||
let expr = self.parse()?;
|
|
||||||
let literals = LiteralSets::create(&expr);
|
|
||||||
let re = self.regex(&expr)?;
|
|
||||||
let required = match literals.to_regex_builder() {
|
|
||||||
Some(builder) => Some(self.regex_build(builder)?),
|
|
||||||
None => {
|
|
||||||
match strip_unicode_word_boundaries(&expr) {
|
|
||||||
None => None,
|
|
||||||
Some(expr) => {
|
|
||||||
debug!("Stripped Unicode word boundaries. \
|
|
||||||
New AST:\n{:?}", expr);
|
|
||||||
self.regex(&expr).ok()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
Ok(Grep {
|
|
||||||
re: re,
|
|
||||||
required: required,
|
|
||||||
opts: self.opts,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates a new regex from the given expression with the current
|
|
||||||
/// configuration.
|
|
||||||
fn regex(&self, expr: &Hir) -> Result<Regex> {
|
|
||||||
let mut builder = RegexBuilder::new(&expr.to_string());
|
|
||||||
builder.unicode(true);
|
|
||||||
self.regex_build(builder)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Builds a new regex from the given builder using the caller's settings.
|
|
||||||
fn regex_build(&self, mut builder: RegexBuilder) -> Result<Regex> {
|
|
||||||
builder
|
|
||||||
.multi_line(true)
|
|
||||||
.size_limit(self.opts.size_limit)
|
|
||||||
.dfa_size_limit(self.opts.dfa_size_limit)
|
|
||||||
.build()
|
|
||||||
.map_err(From::from)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Parses the underlying pattern and ensures the pattern can never match
|
|
||||||
/// the line terminator.
|
|
||||||
fn parse(&self) -> Result<Hir> {
|
|
||||||
let expr = ParserBuilder::new()
|
|
||||||
.allow_invalid_utf8(true)
|
|
||||||
.case_insensitive(self.is_case_insensitive()?)
|
|
||||||
.multi_line(true)
|
|
||||||
.build()
|
|
||||||
.parse(&self.pattern)?;
|
|
||||||
debug!("original regex HIR pattern:\n{}", expr);
|
|
||||||
let expr = nonl::remove(expr, self.opts.line_terminator)?;
|
|
||||||
debug!("transformed regex HIR pattern:\n{}", expr);
|
|
||||||
Ok(expr)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Determines whether the case insensitive flag should be enabled or not.
|
|
||||||
fn is_case_insensitive(&self) -> Result<bool> {
|
|
||||||
if self.opts.case_insensitive {
|
|
||||||
return Ok(true);
|
|
||||||
}
|
|
||||||
if !self.opts.case_smart {
|
|
||||||
return Ok(false);
|
|
||||||
}
|
|
||||||
let cased = match Cased::from_pattern(&self.pattern) {
|
|
||||||
None => return Ok(false),
|
|
||||||
Some(cased) => cased,
|
|
||||||
};
|
|
||||||
Ok(cased.any_literal && !cased.any_uppercase)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Grep {
    /// Returns a reference to the underlying regex used by the searcher.
    pub fn regex(&self) -> &Regex {
        &self.re
    }

    /// Returns an iterator over all matches in the given buffer.
    pub fn iter<'b, 's>(&'s self, buf: &'b [u8]) -> Iter<'b, 's> {
        Iter {
            searcher: self,
            buf: buf,
            start: 0,
        }
    }

    /// Fills in the next line that matches in the given buffer starting at
    /// the position given.
    ///
    /// If no match could be found, `false` is returned, otherwise, `true` is
    /// returned.
    pub fn read_match(
        &self,
        mat: &mut Match,
        buf: &[u8],
        mut start: usize,
    ) -> bool {
        if start >= buf.len() {
            return false;
        }
        if let Some(ref req) = self.required {
            // Fast path: scan with the cheaper "required" regex first, then
            // confirm each candidate line with the full regex.
            while start < buf.len() {
                let e = match req.shortest_match(&buf[start..]) {
                    None => return false,
                    Some(e) => start + e,
                };
                // Expand the candidate position to its full line.
                let (prevnl, nextnl) = self.find_line(buf, e, e);
                match self.re.shortest_match(&buf[prevnl..nextnl]) {
                    None => {
                        // False positive from the required regex: skip past
                        // this line and keep scanning.
                        start = nextnl;
                        continue;
                    }
                    Some(_) => {
                        self.fill_match(mat, prevnl, nextnl);
                        return true;
                    }
                }
            }
            false
        } else {
            // No inner-literal optimization available: run the full regex
            // directly and expand its match to line boundaries.
            let e = match self.re.shortest_match(&buf[start..]) {
                None => return false,
                Some(e) => start + e,
            };
            let (s, e) = self.find_line(buf, e, e);
            self.fill_match(mat, s, e);
            true
        }
    }

    // Records the line span [start, end) into `mat`.
    fn fill_match(&self, mat: &mut Match, start: usize, end: usize) {
        mat.start = start;
        mat.end = end;
    }

    // Expands the offsets `s` and `e` to the boundaries of the line(s)
    // containing them.
    fn find_line(&self, buf: &[u8], s: usize, e: usize) -> (usize, usize) {
        (self.find_line_start(buf, s), self.find_line_end(buf, e))
    }

    // Start offset of the line containing `pos`: one past the previous
    // line terminator, or 0 if there is none.
    fn find_line_start(&self, buf: &[u8], pos: usize) -> usize {
        memrchr(self.opts.line_terminator, &buf[0..pos]).map_or(0, |i| i + 1)
    }

    // End offset (exclusive, including the terminator byte) of the line
    // containing `pos`, or buf.len() if no terminator follows.
    fn find_line_end(&self, buf: &[u8], pos: usize) -> usize {
        memchr(self.opts.line_terminator, &buf[pos..])
            .map_or(buf.len(), |i| pos + i + 1)
    }
}
|
|
||||||
|
|
||||||
/// An iterator over all matches in a particular buffer.
///
/// `'b` refers to the lifetime of the buffer, and `'s` refers to the lifetime
/// of the searcher.
pub struct Iter<'b, 's> {
    // The searcher that produced this iterator.
    searcher: &'s Grep,
    // The buffer being searched.
    buf: &'b [u8],
    // Byte offset at which the next search resumes.
    start: usize,
}
|
|
||||||
|
|
||||||
impl<'b, 's> Iterator for Iter<'b, 's> {
|
|
||||||
type Item = Match;
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Match> {
|
|
||||||
let mut mat = Match::default();
|
|
||||||
if !self.searcher.read_match(&mut mat, self.buf, self.start) {
|
|
||||||
self.start = self.buf.len();
|
|
||||||
return None;
|
|
||||||
}
|
|
||||||
self.start = mat.end;
|
|
||||||
Some(mat)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use memchr::{memchr, memrchr};
    use regex::bytes::Regex;

    use super::{GrepBuilder, Match};

    // Test corpus embedded at compile time.
    static SHERLOCK: &'static [u8] = include_bytes!("./data/sherlock.txt");

    // Oracle: finds matching lines with a plain regex plus manual line
    // boundary expansion, independent of the Grep implementation.
    fn find_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
        let re = Regex::new(pat).unwrap();
        let mut lines = vec![];
        for m in re.find_iter(haystack) {
            let start = memrchr(b'\n', &haystack[..m.start()])
                .map_or(0, |i| i + 1);
            let end = memchr(b'\n', &haystack[m.end()..])
                .map_or(haystack.len(), |i| m.end() + i + 1);
            lines.push(Match {
                start: start,
                end: end,
            });
        }
        lines
    }

    // Runs the same search through the Grep searcher under test.
    fn grep_lines(pat: &str, haystack: &[u8]) -> Vec<Match> {
        let g = GrepBuilder::new(pat).build().unwrap();
        g.iter(haystack).collect()
    }

    // The searcher must agree with the oracle on a simple literal pattern.
    #[test]
    fn buffered_literal() {
        let expected = find_lines("Sherlock Holmes", SHERLOCK);
        let got = grep_lines("Sherlock Holmes", SHERLOCK);
        assert_eq!(expected.len(), got.len());
        assert_eq!(expected, got);
    }
}
|
|
||||||
@@ -1,191 +0,0 @@
|
|||||||
use syntax::ast::{self, Ast};
|
|
||||||
use syntax::ast::parse::Parser;
|
|
||||||
|
|
||||||
/// The results of analyzing a regex for cased literals.
#[derive(Clone, Debug, Default)]
pub struct Cased {
    /// True if and only if a literal uppercase character occurs in the regex.
    ///
    /// A regex like `\pL` contains no uppercase literals, even though `L`
    /// is uppercase and the `\pL` class contains uppercase characters.
    pub any_uppercase: bool,
    /// True if and only if the regex contains any literal at all. A regex like
    /// `\pL` has this set to false.
    pub any_literal: bool,
}
|
|
||||||
|
|
||||||
impl Cased {
    /// Returns a `Cased` value by doing analysis on the AST of `pattern`.
    ///
    /// If `pattern` is not a valid regular expression, then `None` is
    /// returned.
    pub fn from_pattern(pattern: &str) -> Option<Cased> {
        Parser::new()
            .parse(pattern)
            .map(|ast| Cased::from_ast(&ast))
            .ok()
    }

    // Runs the analysis over an already-parsed AST.
    fn from_ast(ast: &Ast) -> Cased {
        let mut cased = Cased::default();
        cased.from_ast_impl(ast);
        cased
    }

    // Recursive worker: walks the AST recording literal/uppercase
    // occurrences, short-circuiting once both flags are set.
    fn from_ast_impl(&mut self, ast: &Ast) {
        if self.done() {
            // Both attributes are already true; nothing can change them.
            return;
        }
        match *ast {
            // These constructs contribute no literal characters.
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::Class(ast::Class::Unicode(_))
            | Ast::Class(ast::Class::Perl(_)) => {}
            Ast::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            Ast::Class(ast::Class::Bracketed(ref x)) => {
                self.from_ast_class_set(&x.kind);
            }
            Ast::Repetition(ref x) => {
                self.from_ast_impl(&x.ast);
            }
            Ast::Group(ref x) => {
                self.from_ast_impl(&x.ast);
            }
            Ast::Alternation(ref alt) => {
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
            Ast::Concat(ref alt) => {
                for x in &alt.asts {
                    self.from_ast_impl(x);
                }
            }
        }
    }

    // Walks a bracketed character class set, e.g. the inside of `[a-zA-Z]`.
    fn from_ast_class_set(&mut self, ast: &ast::ClassSet) {
        if self.done() {
            return;
        }
        match *ast {
            ast::ClassSet::Item(ref item) => {
                self.from_ast_class_set_item(item);
            }
            ast::ClassSet::BinaryOp(ref x) => {
                self.from_ast_class_set(&x.lhs);
                self.from_ast_class_set(&x.rhs);
            }
        }
    }

    // Walks a single item inside a character class.
    fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) {
        if self.done() {
            return;
        }
        match *ast {
            // Named/ASCII/Unicode/Perl classes contribute no literals.
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {}
            ast::ClassSetItem::Literal(ref x) => {
                self.from_ast_literal(x);
            }
            ast::ClassSetItem::Range(ref x) => {
                // Both endpoints of a range like `A-Z` are literals.
                self.from_ast_literal(&x.start);
                self.from_ast_literal(&x.end);
            }
            ast::ClassSetItem::Bracketed(ref x) => {
                self.from_ast_class_set(&x.kind);
            }
            ast::ClassSetItem::Union(ref union) => {
                for x in &union.items {
                    self.from_ast_class_set_item(x);
                }
            }
        }
    }

    // Records a single literal character occurrence.
    fn from_ast_literal(&mut self, ast: &ast::Literal) {
        self.any_literal = true;
        self.any_uppercase = self.any_uppercase || ast.c.is_uppercase();
    }

    /// Returns true if and only if the attributes can never change no matter
    /// what other AST it might see.
    fn done(&self) -> bool {
        self.any_uppercase && self.any_literal
    }
}
|
|
||||||
|
|
||||||
#[cfg(test)]
mod tests {
    use super::*;

    // Helper: analysis must succeed for every pattern used below.
    fn cased(pattern: &str) -> Cased {
        Cased::from_pattern(pattern).unwrap()
    }

    // Exercises the literal/uppercase detection across empty patterns,
    // plain literals, escapes, classes, and ranges.
    #[test]
    fn various() {
        let x = cased("");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        let x = cased("foo");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased("Foo");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased("foO");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        // An escaped backslash is itself a (lowercase-less) literal.
        let x = cased(r"foo\\");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        // Perl classes like \w and \S are not literals.
        let x = cased(r"foo\w");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\S");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo\p{Ll}");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo[a-z]");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        // Range endpoints count as literals, so A-Z flips any_uppercase.
        let x = cased(r"foo[A-Z]");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"foo[\S\t]");
        assert!(!x.any_uppercase);
        assert!(x.any_literal);

        // `\\S` is an escaped backslash followed by a literal uppercase S.
        let x = cased(r"foo\\S");
        assert!(x.any_uppercase);
        assert!(x.any_literal);

        let x = cased(r"\p{Ll}");
        assert!(!x.any_uppercase);
        assert!(!x.any_literal);

        let x = cased(r"aBc\w");
        assert!(x.any_uppercase);
        assert!(x.any_literal);
    }
}
|
|
||||||
@@ -1,53 +0,0 @@
|
|||||||
use syntax::hir::{self, Hir, HirKind};
|
|
||||||
|
|
||||||
/// Strips Unicode word boundaries from the given expression.
|
|
||||||
///
|
|
||||||
/// The key invariant this maintains is that the expression returned will match
|
|
||||||
/// *at least* every where the expression given will match. Namely, a match of
|
|
||||||
/// the returned expression can report false positives but it will never report
|
|
||||||
/// false negatives.
|
|
||||||
///
|
|
||||||
/// If no word boundaries could be stripped, then None is returned.
|
|
||||||
pub fn strip_unicode_word_boundaries(expr: &Hir) -> Option<Hir> {
|
|
||||||
// The real reason we do this is because Unicode word boundaries are the
|
|
||||||
// one thing that Rust's regex DFA engine can't handle. When it sees a
|
|
||||||
// Unicode word boundary among non-ASCII text, it falls back to one of the
|
|
||||||
// slower engines. We work around this limitation by attempting to use
|
|
||||||
// a regex to find candidate matches without a Unicode word boundary. We'll
|
|
||||||
// only then use the full (and slower) regex to confirm a candidate as a
|
|
||||||
// match or not during search.
|
|
||||||
//
|
|
||||||
// It looks like we only check the outer edges for `\b`? I guess this is
|
|
||||||
// an attempt to optimize for the `-w/--word-regexp` flag? ---AG
|
|
||||||
match *expr.kind() {
|
|
||||||
HirKind::Concat(ref es) if !es.is_empty() => {
|
|
||||||
let first = is_unicode_word_boundary(&es[0]);
|
|
||||||
let last = is_unicode_word_boundary(es.last().unwrap());
|
|
||||||
// Be careful not to strip word boundaries if there are no other
|
|
||||||
// expressions to match.
|
|
||||||
match (first, last) {
|
|
||||||
(true, false) if es.len() > 1 => {
|
|
||||||
Some(Hir::concat(es[1..].to_vec()))
|
|
||||||
}
|
|
||||||
(false, true) if es.len() > 1 => {
|
|
||||||
Some(Hir::concat(es[..es.len() - 1].to_vec()))
|
|
||||||
}
|
|
||||||
(true, true) if es.len() > 2 => {
|
|
||||||
Some(Hir::concat(es[1..es.len() - 1].to_vec()))
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns true if the given expression is a Unicode word boundary.
|
|
||||||
fn is_unicode_word_boundary(expr: &Hir) -> bool {
|
|
||||||
match *expr.kind() {
|
|
||||||
HirKind::WordBoundary(hir::WordBoundary::Unicode) => true,
|
|
||||||
HirKind::WordBoundary(hir::WordBoundary::UnicodeNegate) => true,
|
|
||||||
HirKind::Group(ref x) => is_unicode_word_boundary(&x.hir),
|
|
||||||
_ => false,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user