Compare commits

...

8 Commits
v0.1.0 ... main

Author SHA1 Message Date
Laurence Withers 294a41b736 Flesh out README 2023-07-07 11:50:00 +01:00
Laurence Withers f6cd64cf3b Add -I include option, and some usage examples 2023-07-07 11:49:54 +01:00
Laurence Withers eacffb4fe1 Add -L symlink follow option, default false 2023-07-07 11:22:06 +01:00
Laurence Withers 5694cc5194 Move printedFull closer to its use, document 2023-07-07 11:21:41 +01:00
Laurence Withers 67f81d2728 Silence usage message on runtime error 2023-07-07 11:10:31 +01:00
Laurence Withers d2cf57dcd9 Use regexp for case-insensitive literal matches
This commit switches back to using the regexp engine for
case-insensitive literal string matches.

This is slower, but at least case-insensitive matches for string
literals will function now. The code is a tiny bit shorter and simpler
too.

Given the aim of the tool is to be useful for ad-hoc searches,
efficiency isn't the concern but rather just getting the job done with
the minimum of fuss / unexpected behaviour.
2023-07-07 11:00:02 +01:00
Laurence Withers 2f3af7fc8e Improved binary / minified file detection heuristics 2023-05-13 13:17:54 +01:00
Laurence Withers 5790d3ab5f Make exclude match on suffixes by default 2023-05-13 13:13:18 +01:00
3 changed files with 238 additions and 80 deletions

View File

@ -1,3 +1,44 @@
# gg # gg
Recursive grep written in Go, for everyday ease of use. Recursive grep written in Go, for everyday ease of use.
`gg` is a recursive grep. Given a regexp (or fixed pattern) it will search for
the pattern recursively in the current working directory. It will print a
coloured header per file along with the matching line and pattern.
It is possible to scan specific files or directories, rather than the default
current working directory. To do this, simply specify the path(s) as arguments
following the pattern.
It is possible to scan for multiple patterns using the `-e` (or `-Q`) argument,
which can be repeated multiple times. `-e` specifies a regular expression and
`-Q` a fixed pattern. When using either flag, any non-flag arguments are treated
as paths to scan.
Search defaults to case-sensitive but the `-i` flag may be passed to make all
search terms case-insensitive. Alternatively, the `"(?i)"` construct may be added
to a regular expression to make that specific expression case insensitive.
Files and directories can be excluded with the `-x` option. This supports bash-style
globs with `'*'`, `'?'`, `'[a-z]'`, `'{this,that}'`, or `'/**/'` to match zero or more
directories. By default, `.git` and vim swap files are ignored. Similarly, `-I`
filters files to include. Examples:
```
# ignore files/dirs with .js or .css suffix
gg -x '*.js' -x '*.css' pattern
# only match files with .go suffix (any subdir)
gg -I '*.go' pattern
# only match files whose parent dir is "stuff", but ignore "foo" subdir
gg -x ./foo -I 'stuff/*' pattern
# only match .js files with a directory "things" in the path, but ignore
# .min.js (e.g. will match "foo/things/bar/my.js")
gg -I 'things/**/*.js' -x '*.min.js' pattern
```
Symlinks named on the command line are followed, but by default symlinks are
not followed when recursing into directories. `-L` allows them to be
dereferenced.

115
file.go
View File

@ -12,6 +12,20 @@ import (
"unicode/utf8" "unicode/utf8"
) )
const (
// if a matching line is longer than longLine runes then we will print
// a truncated form
longLine = 256
// if any line in the file is detected to be longer than this many runes
// we will count the file as being minified
minifiedLine = 1024
// number of bytes at the start / end of each file to examine for binary
// or minified file detection.
bytesToExamine = 8192
)
type notPlainTextBehaviour int type notPlainTextBehaviour int
const ( const (
@ -54,10 +68,20 @@ func (nptf *notPlainTextFlag) Type() string {
return "short|full|skip" // ??? return "short|full|skip" // ???
} }
var (
// printedFull is set by file() if it prints a full file's matches
// (i.e. a header line, then the match lines). It is cleared if we
// just printed a "Binary file <foo> matches" line. It lets us have a
// nice one-line separator between full files, but show the binary
// matches compactly.
printedFull bool
)
func file(path string, data []byte) { func file(path string, data []byte) {
var short string var short string
isBinary, isMinified := notPlainText(data)
switch { switch {
case isBinary(data): case isBinary:
switch binaryFile.b { switch binaryFile.b {
case notPlainTextShort: case notPlainTextShort:
short = "Binary" short = "Binary"
@ -65,7 +89,7 @@ func file(path string, data []byte) {
return return
} }
case isMinified(data): case isMinified:
switch minifiedFile.b { switch minifiedFile.b {
case notPlainTextShort: case notPlainTextShort:
short = "Minified" short = "Minified"
@ -194,55 +218,62 @@ func file(path string, data []byte) {
} }
} }
func isBinary(data []byte) bool { func notPlainText(data []byte) (isBinary, isMinified bool) {
bytesToExamine := 4096 // examine bytes at the start of the file for binary data
b, m := notPlainTextAux(data)
if b || len(data) < bytesToExamine*2 {
return b, m
}
for bytesToExamine > 0 { // some files, like .a files, have a header which passes plaintext
// detection but we can expect the trailer to be binary
data = data[len(data)-bytesToExamine:]
for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
if utf8.RuneStart(data[0]) {
break
}
data = data[1:]
}
b2, m2 := notPlainTextAux(data)
return (b || b2), (m || m2)
}
func notPlainTextAux(data []byte) (isBinary, isMinified bool) {
n := bytesToExamine
var lineLength int
for n > 0 {
r, s := utf8.DecodeRune(data) r, s := utf8.DecodeRune(data)
switch { switch {
case s == 0: case s == 0:
// end of string // end of string
return false return
case s == 1 && r == utf8.RuneError: case s == 1 && r == utf8.RuneError:
// invalid UTF-8 // invalid UTF-8
return true isBinary = true
case r == '\r', r == '\n', r == '\t': return
// valid control chars
case r < ' ':
// invalid control chars
return true
}
data = data[s:]
bytesToExamine -= s
}
return false
}
const longLine = 256
func isMinified(data []byte) bool {
bytesToExamine := 4096
var lineLength int
for bytesToExamine > 0 {
r, s := utf8.DecodeRune(data)
switch {
case s == 0:
// end of string
return false
case r == '\n': case r == '\n':
lineLength = 0 // newline
default: lineLength = -1
lineLength++
if lineLength >= longLine { case r == '\r', r == '\t', r == '\v':
return true // valid control chars often present in text
}
case r < ' ':
// control chars not expected in plain text
isBinary = true
return
} }
data = data[s:] data = data[s:]
bytesToExamine -= s n -= s
lineLength++
if lineLength >= minifiedLine {
isMinified = true
} }
return false }
return
} }
func findMatches(data []byte) (loc []int) { func findMatches(data []byte) (loc []int) {
@ -252,12 +283,6 @@ func findMatches(data []byte) (loc []int) {
return loc return loc
} }
} }
for _, s := range searchBytes {
pos := bytes.Index(data, s)
if pos != -1 {
return []int{pos, pos + len(s)}
}
}
return nil return nil
} }

148
main.go
View File

@ -9,6 +9,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"regexp" "regexp"
"strings"
"github.com/bmatcuk/doublestar/v4" "github.com/bmatcuk/doublestar/v4"
"github.com/spf13/cobra" "github.com/spf13/cobra"
@ -16,7 +17,6 @@ import (
) )
// TODO: // TODO:
// - it would be better to make fixed patterns case insensitive too.
// - configurable defaults for exclude. // - configurable defaults for exclude.
func main() { func main() {
@ -41,46 +41,73 @@ which can be repeated multiple times. -e specifies a regular expression and
-Q a fixed pattern. When using either flag, any non-flag arguments are treated -Q a fixed pattern. When using either flag, any non-flag arguments are treated
as paths to scan. as paths to scan.
Search defaults to case-sensitive but the -i flag may be passed to make regular Search defaults to case-sensitive but the -i flag may be passed to make all
expression searches case-insensitive. Alternatively, the "(?i)" construct may be search terms case-insensitive. Alternatively, the "(?i)" construct may be added
added to a regular expression to make that specific expression case insensitive. to a regular expression to make that specific expression case insensitive.
Fixed pattern matches are always case-sensitive.
Files and directories can be excluded with the -x option. This supports bash-style Files and directories can be excluded with the -x option. This supports bash-style
globs with '*', '?', '[a-z]', '{this,that}', or '/**/' to match zero or more globs with '*', '?', '[a-z]', '{this,that}', or '/**/' to match zero or more
directories. By default, .git and vim swap files are ignored.`, directories. By default, .git and vim swap files are ignored. Similarly, -I
filters files to include. Examples:
# ignore files/dirs with .js or .css suffix
gg -x '*.js' -x '*.css' pattern
# only match files with .go suffix (any subdir)
gg -I '*.go' pattern
# only match files whose parent dir is "stuff", but ignore "foo" subdir
gg -x ./foo -I 'stuff/*' pattern
# only match .js files with a directory "things" in the path, but ignore
# .min.js (e.g. will match "foo/things/bar/my.js")
gg -I 'things/**/*.js' -x '*.min.js' pattern
Symlinks named on the command line are followed, but by default symlinks are
not followed when recursing into directories. -L allows them to be
dereferenced.`,
RunE: run, RunE: run,
} }
var ( var (
// flags
searchRegexp []string searchRegexp []string
regexps []*regexp.Regexp
searchFixed []string searchFixed []string
searchBytes [][]byte
searchPath []string searchPath []string
excludeList []string excludeList []string
binaryFile notPlainTextFlag includeList []string
minifiedFile notPlainTextFlag
ignoreCase bool ignoreCase bool
noColour bool noColour bool
binaryFile notPlainTextFlag
minifiedFile notPlainTextFlag
followSymlinks bool
// computed from searchRegexp, searchFixed. Each regexp here will be
// matched against each line of each input file.
regexps []*regexp.Regexp
// formats output
display *Display display *Display
printedFull bool
) )
func init() { func init() {
rootCmd.Flags().StringSliceVarP(&searchRegexp, "grep", "e", nil, "pattern to match (regular expression)") rootCmd.Flags().StringSliceVarP(&searchRegexp, "grep", "e", nil, "pattern to match (regular expression)")
rootCmd.Flags().StringSliceVarP(&searchFixed, "fixed", "Q", nil, "pattern to match (fixed string)") rootCmd.Flags().StringSliceVarP(&searchFixed, "fixed", "Q", nil, "pattern to match (fixed string)")
rootCmd.Flags().StringSliceVarP(&excludeList, "exclude", "x", []string{".git", ".*.swp"}, "files/directories to exclude") rootCmd.Flags().StringSliceVarP(&excludeList, "exclude", "x", []string{".git", ".*.swp"}, "files/directories to exclude")
rootCmd.Flags().StringSliceVarP(&includeList, "include", "I", nil, "files/directories to include")
rootCmd.Flags().BoolVarP(&ignoreCase, "ignore-case", "i", false, "make all searches case insensitive") rootCmd.Flags().BoolVarP(&ignoreCase, "ignore-case", "i", false, "make all searches case insensitive")
rootCmd.Flags().BoolVarP(&noColour, "no-colour", "C", false, "disable colour output") rootCmd.Flags().BoolVarP(&noColour, "no-colour", "C", false, "disable colour output")
rootCmd.Flags().Var(&binaryFile, "binary", "what to do with binary files") rootCmd.Flags().Var(&binaryFile, "binary", "what to do with binary files")
rootCmd.Flags().Var(&minifiedFile, "minified", "what to do with minified text files") rootCmd.Flags().Var(&minifiedFile, "minified", "what to do with minified text files")
rootCmd.Flags().BoolVarP(&followSymlinks, "dereference", "L", false, "follow symlinks when recursing")
} }
func run(c *cobra.Command, args []string) error { func run(c *cobra.Command, args []string) error {
display = NewDisplay(noColour) display = NewDisplay(noColour)
// if no -e or -Q flag is passed, then the first arg is taken to be
// the pattern to match
if len(searchRegexp) == 0 && len(searchFixed) == 0 { if len(searchRegexp) == 0 && len(searchFixed) == 0 {
if len(args) == 0 { if len(args) == 0 {
return errors.New("no pattern specified") return errors.New("no pattern specified")
@ -89,17 +116,38 @@ func run(c *cobra.Command, args []string) error {
args = args[1:] args = args[1:]
} }
// remaining arguments are treated as search paths; an empty list is
// taken to mean the CWD
searchPath = args searchPath = args
if len(searchPath) == 0 { if len(searchPath) == 0 {
searchPath = append(searchPath, ".") searchPath = append(searchPath, ".")
} }
for _, x := range excludeList { // if we got past argument parsing, then returned errors are runtime
// things (like file not found) that shouldn't trigger a usage message.
c.SilenceUsage = true
// for -x and -I, an undecorated pattern is treated as a suffix match
for i, x := range excludeList {
if !strings.HasPrefix(x, "**/") && !strings.HasPrefix(x, "./") {
x = "**/" + x
excludeList[i] = x
}
if !doublestar.ValidatePattern(x) { if !doublestar.ValidatePattern(x) {
return fmt.Errorf("invalid exclude pattern %q", x) return fmt.Errorf("invalid exclude pattern %q", x)
} }
} }
for i, x := range includeList {
if !strings.HasPrefix(x, "**/") && !strings.HasPrefix(x, "./") {
x = "**/" + x
includeList[i] = x
}
if !doublestar.ValidatePattern(x) {
return fmt.Errorf("invalid include pattern %q", x)
}
}
// compile regular expressions for matching
for _, r := range searchRegexp { for _, r := range searchRegexp {
if ignoreCase { if ignoreCase {
r = "(?i)" + r r = "(?i)" + r
@ -110,14 +158,21 @@ func run(c *cobra.Command, args []string) error {
} }
regexps = append(regexps, re) regexps = append(regexps, re)
} }
for _, r := range searchFixed {
for _, s := range searchFixed { r = regexp.QuoteMeta(r)
searchBytes = append(searchBytes, []byte(s)) if ignoreCase {
r = "(?i)" + r
}
re, err := regexp.Compile(r)
if err != nil {
	// a failed compile yields a nil *regexp.Regexp, which must never
	// be added to regexps; report the error to the caller instead
	return err
}
regexps = append(regexps, re)
} }
// search over named paths
var errs []error var errs []error
for _, path := range searchPath { for _, path := range searchPath {
if err := search(path); err != nil { if err := search(path, true); err != nil {
errs = append(errs, err) errs = append(errs, err)
} }
} }
@ -132,26 +187,63 @@ func recurse(path string) error {
} }
var errs []error var errs []error
NextFile:
for _, de := range d { for _, de := range d {
name := de.Name() fullPath := filepath.Join(path, de.Name())
fullPath := filepath.Join(path, name) if !shouldSearch(fullPath, de.IsDir()) {
continue
for _, x := range excludeList {
if exclude, _ := doublestar.Match(x, fullPath); exclude {
continue NextFile
} }
} if err := search(fullPath, followSymlinks); err != nil {
if err := search(fullPath); err != nil {
errs = append(errs, err) errs = append(errs, err)
} }
} }
return errors.Join(errs...) return errors.Join(errs...)
} }
func search(path string) error { // shouldSearch matches the full path of the file against the include and
st, err := os.Stat(path) // exclude lists, returning true if we should consider the file/directory for
// searching and false if not.
func shouldSearch(fullPath string, isDir bool) bool {
	// process the exclude list first: an exclusion always wins over an
	// inclusion. The error from doublestar.Match is ignored because
	// run() has already checked every pattern with
	// doublestar.ValidatePattern.
	for _, x := range excludeList {
		if exclude, _ := doublestar.Match(x, fullPath); exclude {
			return false
		}
	}

	// if the include list is empty, everything is included
	if len(includeList) == 0 {
		return true
	}

	for _, x := range includeList {
		// if it's a directory, and we have at least one recursive
		// matcher, then search: the directory itself need not match
		// the pattern for entries below it to do so
		if isDir && strings.HasPrefix(x, "**/") {
			return true
		}
		if include, _ := doublestar.Match(x, fullPath); include {
			return true
		}
	}
	return false
}
func search(path string, deref bool) error {
var (
st os.FileInfo
err error
)
if deref {
st, err = os.Stat(path)
} else {
st, err = os.Lstat(path)
}
if err != nil { if err != nil {
return err return err
} }