Compare commits

...

8 Commits
v0.1.0 ... main

Author SHA1 Message Date
Laurence Withers 294a41b736 Flesh out README 2023-07-07 11:50:00 +01:00
Laurence Withers f6cd64cf3b Add -I include option, and some usage examples 2023-07-07 11:49:54 +01:00
Laurence Withers eacffb4fe1 Add -L symlink follow option, default false 2023-07-07 11:22:06 +01:00
Laurence Withers 5694cc5194 Move printedFull closer to its use, document 2023-07-07 11:21:41 +01:00
Laurence Withers 67f81d2728 Silence usage message on runtime error 2023-07-07 11:10:31 +01:00
Laurence Withers d2cf57dcd9 Use regexp for case-insensitive literal matches
This commit switches back to using the regexp engine for
case-insensitive literal string matches.

This is slower, but at least case-insensitive matches for string
literals will function now. The code is a tiny bit shorter and simpler
too.

Given the aim of the tool is to be useful for ad-hoc searches,
efficiency isn't the concern but rather just getting the job done with
the minimum of fuss / unexpected behaviour.
2023-07-07 11:00:02 +01:00
Laurence Withers 2f3af7fc8e Improved binary / minified file detection heuristics 2023-05-13 13:17:54 +01:00
Laurence Withers 5790d3ab5f Make exclude match on suffixes by default 2023-05-13 13:13:18 +01:00
3 changed files with 238 additions and 80 deletions

View File

@ -1,3 +1,44 @@
# gg
Recursive grep written in Go, for everyday ease of use.
Recursive grep written in Go, for everyday ease of use.
`gg` is a recursive grep. Given a regexp (or fixed pattern) it will search for
the pattern recursively in the current working directory. It will print a
coloured header per file along with the matching line and pattern.
It is possible to scan specific files or directories, rather than the default
current working directory. To do this, simply specify the path(s) as arguments
following the pattern.
It is possible to scan for multiple patterns using the `-e` (or `-Q`) argument,
which can be repeated multiple times. `-e` specifies a regular expression and
`-Q` a fixed pattern. When using either flag, any non-flag arguments are treated
as paths to scan.
Search defaults to case-sensitive but the `-i` flag may be passed to make all
search terms case-insensitive. Alternatively, the `"(?i)"` construct may be added
to a regular expression to make that specific expression case insensitive.
Files and directories can be excluded with the `-x` option. This supports bash-style
globs with `'*'`, `'?'`, `'[a-z]'`, `'{this,that}'`, or `'/**/'` to match zero or more
directories. By default, `.git` and vim swap files are ignored. Similarly, `-I`
filters files to include. Examples:
```
# ignore files/dirs with .js or .css suffix
gg -x '*.js' -x '*.css' pattern
# only match files with .go suffix (any subdir)
gg -I '*.go' pattern
# only match files whose parent dir is "stuff", but ignore "foo" subdir
gg -x ./foo -I 'stuff/*' pattern
# only match .js files with a directory "things" in the path, but ignore
# .min.js (e.g. will match "foo/things/bar/my.js")
gg -I 'things/**/*.js' -x '*.min.js' pattern
```
Symlinks named on the command line are followed, but by default symlinks are
not followed when recursing into directories. `-L` allows them to be
dereferenced.

113
file.go
View File

@ -12,6 +12,20 @@ import (
"unicode/utf8"
)
const (
// if a matching line is longer than longLine runes then we will print
// a truncated form
longLine = 256
// if any line in the file is detected to be longer than this many runes
// we will count the file as being minified
minifiedLine = 1024
// number of bytes at the start / end of each file to examine for binary
// or minified file detection.
bytesToExamine = 8192
)
type notPlainTextBehaviour int
const (
@ -54,10 +68,20 @@ func (nptf *notPlainTextFlag) Type() string {
return "short|full|skip" // ???
}
var (
// printedFull is set by file() if it prints a full file's matches
// (i.e. a header line, then the match lines). It is cleared if we
// just printed a "Binary file <foo> matches" line. It lets us have a
// nice one-line separator between full files, but show the binary
// matches compactly.
printedFull bool
)
func file(path string, data []byte) {
var short string
isBinary, isMinified := notPlainText(data)
switch {
case isBinary(data):
case isBinary:
switch binaryFile.b {
case notPlainTextShort:
short = "Binary"
@ -65,7 +89,7 @@ func file(path string, data []byte) {
return
}
case isMinified(data):
case isMinified:
switch minifiedFile.b {
case notPlainTextShort:
short = "Minified"
@ -194,55 +218,62 @@ func file(path string, data []byte) {
}
}
func isBinary(data []byte) bool {
bytesToExamine := 4096
for bytesToExamine > 0 {
r, s := utf8.DecodeRune(data)
switch {
case s == 0:
// end of string
return false
case s == 1 && r == utf8.RuneError:
// invalid UTF-8
return true
case r == '\r', r == '\n', r == '\t':
// valid control chars
case r < ' ':
// invalid control chars
return true
}
data = data[s:]
bytesToExamine -= s
func notPlainText(data []byte) (isBinary, isMinified bool) {
// examine bytes at the start of the file for binary data
b, m := notPlainTextAux(data)
if b || len(data) < bytesToExamine*2 {
return b, m
}
return false
// some files, like .a files, have a header which passes plaintext
// detection but we can expect the trailer to be binary
data = data[len(data)-bytesToExamine:]
for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
if utf8.RuneStart(data[0]) {
break
}
data = data[1:]
}
b2, m2 := notPlainTextAux(data)
return (b || b2), (m || m2)
}
const longLine = 256
func isMinified(data []byte) bool {
bytesToExamine := 4096
func notPlainTextAux(data []byte) (isBinary, isMinified bool) {
n := bytesToExamine
var lineLength int
for bytesToExamine > 0 {
for n > 0 {
r, s := utf8.DecodeRune(data)
switch {
case s == 0:
// end of string
return false
return
case s == 1 && r == utf8.RuneError:
// invalid UTF-8
isBinary = true
return
case r == '\n':
lineLength = 0
default:
lineLength++
if lineLength >= longLine {
return true
}
// newline
lineLength = -1
case r == '\r', r == '\t', r == '\v':
// valid control chars often present in text
case r < ' ':
// control chars not expected in plain text
isBinary = true
return
}
data = data[s:]
bytesToExamine -= s
n -= s
lineLength++
if lineLength >= minifiedLine {
isMinified = true
}
}
return false
return
}
func findMatches(data []byte) (loc []int) {
@ -252,12 +283,6 @@ func findMatches(data []byte) (loc []int) {
return loc
}
}
for _, s := range searchBytes {
pos := bytes.Index(data, s)
if pos != -1 {
return []int{pos, pos + len(s)}
}
}
return nil
}

162
main.go
View File

@ -9,6 +9,7 @@ import (
"os"
"path/filepath"
"regexp"
"strings"
"github.com/bmatcuk/doublestar/v4"
"github.com/spf13/cobra"
@ -16,7 +17,6 @@ import (
)
// TODO:
// - it would be better to make fixed patterns case insensitive too.
// - configurable defaults for exclude.
func main() {
@ -41,46 +41,73 @@ which can be repeated multiple times. -e specifies a regular expression and
-Q a fixed pattern. When using either flag, any non-flag arguments are treated
as paths to scan.
Search defaults to case-sensitive but the -i flag may be passed to make regular
expression searches case-insensitive. Alternatively, the "(?i)" construct may be
added to a regular expression to make that specific expression case insensitive.
Fixed pattern matches are always case-sensitive.
Search defaults to case-sensitive but the -i flag may be passed to make all
search terms case-insensitive. Alternatively, the "(?i)" construct may be added
to a regular expression to make that specific expression case insensitive.
Files and directories can be excluded with the -x option. This supports bash-style
globs with '*', '?', '[a-z]', '{this,that}', or '/**/' to match zero or more
directories. By default, .git and vim swap files are ignored.`,
directories. By default, .git and vim swap files are ignored. Similarly, -I
filters files to include. Examples:
# ignore files/dirs with .js or .css suffix
gg -x '*.js' -x '*.css' pattern
# only match files with .go suffix (any subdir)
gg -I '*.go' pattern
# only match files whose parent dir is "stuff", but ignore "foo" subdir
gg -x ./foo -I 'stuff/*' pattern
# only match .js files with a directory "things" in the path, but ignore
# .min.js (e.g. will match "foo/things/bar/my.js")
gg -I 'things/**/*.js' -x '*.min.js' pattern
Symlinks named on the command line are followed, but by default symlinks are
not followed when recursing into directories. -L allows them to be
dereferenced.`,
RunE: run,
}
var (
searchRegexp []string
regexps []*regexp.Regexp
searchFixed []string
searchBytes [][]byte
searchPath []string
excludeList []string
binaryFile notPlainTextFlag
minifiedFile notPlainTextFlag
ignoreCase bool
noColour bool
display *Display
printedFull bool
// flags
searchRegexp []string
searchFixed []string
searchPath []string
excludeList []string
includeList []string
ignoreCase bool
noColour bool
binaryFile notPlainTextFlag
minifiedFile notPlainTextFlag
followSymlinks bool
// computed from searchRegexp, searchFixed. Each regexp here will be
// matched against each line of each input file.
regexps []*regexp.Regexp
// formats output
display *Display
)
// init registers every command-line flag on rootCmd, binding each one to the
// package-level variable that run() and the search code read later.
func init() {
	flags := rootCmd.Flags()

	// what to search for
	flags.StringSliceVarP(&searchRegexp, "grep", "e", nil, "pattern to match (regular expression)")
	flags.StringSliceVarP(&searchFixed, "fixed", "Q", nil, "pattern to match (fixed string)")

	// which files/directories to consider
	flags.StringSliceVarP(&excludeList, "exclude", "x", []string{".git", ".*.swp"}, "files/directories to exclude")
	flags.StringSliceVarP(&includeList, "include", "I", nil, "files/directories to include")

	// matching and output behaviour
	flags.BoolVarP(&ignoreCase, "ignore-case", "i", false, "make all searches case insensitive")
	flags.BoolVarP(&noColour, "no-colour", "C", false, "disable colour output")

	// handling of files that are not plain text
	flags.Var(&binaryFile, "binary", "what to do with binary files")
	flags.Var(&minifiedFile, "minified", "what to do with minified text files")

	// directory traversal
	flags.BoolVarP(&followSymlinks, "dereference", "L", false, "follow symlinks when recursing")
}
func run(c *cobra.Command, args []string) error {
display = NewDisplay(noColour)
// if no -e or -Q flag is passed, then the first arg is taken to be
// the pattern to match
if len(searchRegexp) == 0 && len(searchFixed) == 0 {
if len(args) == 0 {
return errors.New("no pattern specified")
@ -89,17 +116,38 @@ func run(c *cobra.Command, args []string) error {
args = args[1:]
}
// remaining arguments are treated as search paths; an empty list is
// taken to mean the CWD
searchPath = args
if len(searchPath) == 0 {
searchPath = append(searchPath, ".")
}
for _, x := range excludeList {
// if we got past argument parsing, then returned errors are runtime
// things (like file not found) that shouldn't trigger a usage message.
c.SilenceUsage = true
// for -x and -I, an undecorated pattern is treated as a suffix match
for i, x := range excludeList {
if !strings.HasPrefix(x, "**/") && !strings.HasPrefix(x, "./") {
x = "**/" + x
excludeList[i] = x
}
if !doublestar.ValidatePattern(x) {
return fmt.Errorf("invalid exclude pattern %q", x)
}
}
for i, x := range includeList {
if !strings.HasPrefix(x, "**/") && !strings.HasPrefix(x, "./") {
x = "**/" + x
includeList[i] = x
}
if !doublestar.ValidatePattern(x) {
return fmt.Errorf("invalid include pattern %q", x)
}
}
// compile regular expressions for matching
for _, r := range searchRegexp {
if ignoreCase {
r = "(?i)" + r
@ -110,14 +158,21 @@ func run(c *cobra.Command, args []string) error {
}
regexps = append(regexps, re)
}
for _, s := range searchFixed {
searchBytes = append(searchBytes, []byte(s))
for _, s := range searchFixed {
	// fixed patterns are matched via the regexp engine too: quote the
	// literal so that every character is matched verbatim, then apply
	// the same case-insensitivity prefix used for regular expressions.
	pat := regexp.QuoteMeta(s)
	if ignoreCase {
		pat = "(?i)" + pat
	}
	re, err := regexp.Compile(pat)
	if err != nil {
		// never append a nil *regexp.Regexp; report the failure instead
		return fmt.Errorf("invalid fixed pattern %q: %w", s, err)
	}
	regexps = append(regexps, re)
}
// search over named paths
var errs []error
for _, path := range searchPath {
if err := search(path); err != nil {
if err := search(path, true); err != nil {
errs = append(errs, err)
}
}
@ -132,26 +187,63 @@ func recurse(path string) error {
}
var errs []error
NextFile:
for _, de := range d {
name := de.Name()
fullPath := filepath.Join(path, name)
for _, x := range excludeList {
if exclude, _ := doublestar.Match(x, fullPath); exclude {
continue NextFile
}
fullPath := filepath.Join(path, de.Name())
if !shouldSearch(fullPath, de.IsDir()) {
continue
}
if err := search(fullPath); err != nil {
if err := search(fullPath, followSymlinks); err != nil {
errs = append(errs, err)
}
}
return errors.Join(errs...)
}
func search(path string) error {
st, err := os.Stat(path)
// shouldSearch matches the full path of the file against the include and
// exclude lists, returning true if we should consider the file/directory for
// searching and false if not.
//
// The exclude list is consulted first, so an exclusion always wins. If the
// include list is empty, everything not excluded is searched. Otherwise a
// path is searched if it matches an include pattern, or if it is a directory
// and at least one include pattern starts with "**/".
//
// Errors from doublestar.Match are ignored here; run() checks every pattern
// with doublestar.ValidatePattern before any search starts.
func shouldSearch(fullPath string, isDir bool) bool {
	// process the exclude list first
	for _, x := range excludeList {
		if exclude, _ := doublestar.Match(x, fullPath); exclude {
			return false
		}
	}

	// if the include list is empty, everything is included
	if len(includeList) == 0 {
		return true
	}

	for _, x := range includeList {
		// if it's a directory, and we have at least one recursive
		// matcher, then search: files further down the tree may still
		// match even though the directory itself does not
		if isDir && strings.HasPrefix(x, "**/") {
			return true
		}
		if include, _ := doublestar.Match(x, fullPath); include {
			return true
		}
	}

	return false
}
func search(path string, deref bool) error {
var (
st os.FileInfo
err error
)
if deref {
st, err = os.Stat(path)
} else {
st, err = os.Lstat(path)
}
if err != nil {
return err
}