Binary and minified file detection/handling

This commit is contained in:
Laurence Withers 2023-05-13 12:10:05 +01:00
parent 4fd3ae4c7e
commit 8b56504cd7
2 changed files with 274 additions and 146 deletions

266
file.go Normal file
View File

@ -0,0 +1,266 @@
/*
gg is a recursive grep written in Go, with some shortcuts for everyday use.
*/
package main
import (
"bytes"
"errors"
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
// notPlainTextBehaviour selects what is done with a file that has been
// classified as something other than plain text.
type notPlainTextBehaviour int

const (
	// notPlainTextShort is the zero value, and therefore the default.
	notPlainTextShort notPlainTextBehaviour = iota
	notPlainTextFull
	notPlainTextSkip
)

// notPlainTextNames holds the command-line spelling of each behaviour,
// indexed by the behaviour's numeric value.
var notPlainTextNames = [...]string{
	notPlainTextShort: "short",
	notPlainTextFull:  "full",
	notPlainTextSkip:  "skip",
}

// notPlainTextFlag wraps a notPlainTextBehaviour so that it can be set from a
// command-line flag through its String, Set and Type methods.
type notPlainTextFlag struct {
	b notPlainTextBehaviour
}

// String returns the command-line spelling of the current behaviour, or "???"
// if the stored value is not one of the known constants.
func (nptf *notPlainTextFlag) String() string {
	if idx := int(nptf.b); idx >= 0 && idx < len(notPlainTextNames) {
		return notPlainTextNames[idx]
	}
	return "???"
}

// Set parses s ("short", "full" or "skip") and stores the matching behaviour.
// Any other value yields an error and leaves the flag untouched.
func (nptf *notPlainTextFlag) Set(s string) error {
	for idx, name := range notPlainTextNames {
		if s == name {
			nptf.b = notPlainTextBehaviour(idx)
			return nil
		}
	}
	return errors.New("must be one of short|full|skip")
}

// Type returns a description of the values the flag accepts.
// NOTE(review): presumably surfaced by the flag library in usage output —
// confirm this is the desired placeholder text.
func (nptf *notPlainTextFlag) Type() string {
	return "short|full|skip"
}
// file searches data, the contents of the file named path, one line at a time
// and prints every matching line to standard output.
//
// Before searching, data is classified with isBinary and then isMinified, and
// the binaryFile / minifiedFile flags decide how such a file is treated:
//   - skip: return without searching;
//   - short: on the first match print "<Binary|Minified> file <path> matches."
//     and return;
//   - full: report matches exactly as for a plain-text file.
//
// In full output the filename is printed once, ahead of the first matching
// line, preceded by a blank line if an earlier file already produced full
// output (tracked in the package-level printedFull). Each matching line is
// printed with its line number and the match highlighted, with at most
// maxContext bytes of context on either side of the match. Truncation points
// are nudged onto UTF-8 rune boundaries where one is found nearby.
func file(path string, data []byte) {
	const (
		// maxContext is the number of bytes of a line shown either side
		// of a match, and the match length at which trailing context is
		// dropped altogether.
		maxContext = 128
		// boundaryScan bounds the search for a rune start around a
		// truncation point.
		boundaryScan = 5
	)

	var short string
	switch {
	case isBinary(data):
		switch binaryFile.b {
		case notPlainTextShort:
			short = "Binary"
		case notPlainTextSkip:
			return
		}
	case isMinified(data):
		switch minifiedFile.b {
		case notPlainTextShort:
			short = "Minified"
		case notPlainTextSkip:
			return
		}
	}

	var (
		lineNum       int
		printedHeader bool
		b, b2         strings.Builder
	)

	// Split into lines; a final line without a trailing newline is still
	// searched.
	for len(data) > 0 {
		eol := bytes.IndexByte(data, '\n')
		lineNum++

		var line []byte
		if eol == -1 {
			line = data
			data = nil
		} else {
			line = data[:eol]
			data = data[eol+1:]
		}

		loc := findMatches(line)
		if loc == nil {
			continue
		}

		switch {
		case !printedHeader && short != "":
			// Short mode: a single summary line for the whole file.
			if printedFull {
				fmt.Println("")
				printedFull = false
			}
			fmt.Printf("%s file %s matches.\n", short, path)
			return

		case !printedHeader:
			if printedFull {
				fmt.Println("")
			}
			printedFull = true
			fmt.Println(display.Filename(path))
			printedHeader = true
		}

		b.Reset()
		fmt.Fprintf(&b, "%4d: ", display.LineNumber(lineNum))

		// Leading context.
		if loc[0] < maxContext {
			escape(&b, line[:loc[0]])
		} else {
			// start stays below loc[0] (and so within line) because
			// boundaryScan is smaller than maxContext.
			start := loc[0] - maxContext
			for i := 0; i < boundaryScan; i++ {
				if utf8.RuneStart(line[start]) {
					break
				}
				start++
			}
			b.WriteString(display.TruncatedBytes(start).String())
			b.WriteString(display.TruncatedMarker().String())
			escape(&b, line[start:loc[0]])
		}

		if loc[1]-loc[0] < maxContext {
			// Short match: show it in full, followed by trailing context.
			b2.Reset()
			escape(&b2, line[loc[0]:loc[1]])
			b.WriteString(display.Match(b2.String()).String())

			// ">=" rather than ">": when exactly maxContext bytes
			// remain, nothing needs truncating, and indexing
			// line[len(line)] below would panic.
			if loc[1]+maxContext >= len(line) {
				escape(&b, line[loc[1]:])
			} else {
				end := loc[1] + maxContext
				for i := 0; i < boundaryScan; i++ {
					if utf8.RuneStart(line[end]) {
						break
					}
					end--
				}
				escape(&b, line[loc[1]:end])
				b.WriteString(display.TruncatedBytes(len(line) - end).String())
				b.WriteString(display.TruncatedMarker().String())
			}
		} else {
			// Long match: show the match and omit trailing context.
			// NOTE(review): the match itself is not shortened here;
			// confirm whether capping it at maxContext was intended.
			end := loc[1]
			// A match may end exactly at the end of the line, in which
			// case there is no byte at line[end] to inspect.
			for i := 0; i < boundaryScan && end < len(line); i++ {
				if utf8.RuneStart(line[end]) {
					break
				}
				end--
			}
			b2.Reset()
			escape(&b2, line[loc[0]:end])
			b.WriteString(display.Match(b2.String()).String())
			// Only flag truncation when something was actually left out.
			if end < len(line) {
				b.WriteString(display.TruncatedMarker().String())
				b.WriteString(display.TruncatedBytes(len(line) - end).String())
			}
		}

		b.WriteRune('\n')
		fmt.Print(b.String())
	}
}
// isBinary reports whether data looks like a binary file. Runes starting
// within roughly the first 4096 bytes are inspected: an invalid UTF-8
// sequence, or a control character below space other than carriage return,
// newline or tab, marks the data as binary. Empty data is not binary.
func isBinary(data []byte) bool {
	const sampleSize = 4096

	for off := 0; off < sampleSize; {
		r, size := utf8.DecodeRune(data[off:])
		if size == 0 {
			// Ran out of data without finding anything suspicious.
			return false
		}
		if r == utf8.RuneError && size == 1 {
			// Invalid UTF-8 encoding.
			return true
		}
		if r < ' ' && r != '\r' && r != '\n' && r != '\t' {
			// Control character not expected in text.
			return true
		}
		off += size
	}
	return false
}
// isMinified reports whether data looks like minified text, which is taken to
// mean that a line of at least 256 runes begins within roughly the first 4096
// bytes. Line length is counted in decoded runes, with each undecodable byte
// counting as one.
func isMinified(data []byte) bool {
	const (
		longLine   = 256
		sampleSize = 4096
	)

	runesOnLine := 0
	for off := 0; off < sampleSize; {
		r, size := utf8.DecodeRune(data[off:])
		if size == 0 {
			// Reached the end of the data without seeing a long line.
			return false
		}
		off += size

		if r == '\n' {
			runesOnLine = 0
			continue
		}
		runesOnLine++
		if runesOnLine >= longLine {
			return true
		}
	}
	return false
}
// findMatches looks for the first search term that occurs in data. The
// patterns in regexps are tried first, in order, followed by the literal byte
// strings in searchBytes. The result holds the start and end byte offsets of
// the match within data; it is nil when no term matches.
func findMatches(data []byte) (loc []int) {
	for _, pattern := range regexps {
		if loc = pattern.FindIndex(data); loc != nil {
			return loc
		}
	}

	for _, literal := range searchBytes {
		start := bytes.Index(data, literal)
		if start == -1 {
			continue
		}
		return []int{start, start + len(literal)}
	}

	return nil
}
// escape appends a displayable rendering of s to b. Tabs and printable runes
// are written unchanged; invalid UTF-8 bytes, carriage returns and all other
// unprintable runes are each replaced by the corresponding marker obtained
// from display.
func escape(b *strings.Builder, s []byte) {
	for pos := 0; pos < len(s); {
		r, size := utf8.DecodeRune(s[pos:])
		pos += size

		if r == utf8.RuneError && size == 1 {
			// A byte that does not form valid UTF-8.
			b.WriteString(display.BadUTF8Char().String())
			continue
		}
		if r == '\r' {
			b.WriteString(display.CarriageReturn().String())
			continue
		}
		if r == '\t' || unicode.IsPrint(r) {
			b.WriteRune(r)
			continue
		}
		b.WriteString(display.UnprintableChar().String())
	}
}

154
main.go
View File

@ -4,15 +4,10 @@ gg is a recursive grep written in Go, with some shortcuts for everyday use.
package main
import (
"bytes"
"errors"
"fmt"
"os"
"path/filepath"
"regexp"
"strings"
"unicode"
"unicode/utf8"
"github.com/spf13/cobra"
"golang.org/x/sys/unix"
@ -20,8 +15,6 @@ import (
// TODO:
// - bold of escaped output doesn't work
// - binary file detection
// - long-line / minified-file detection
// - ignore files by extension (or glob?)
func main() {
@ -61,11 +54,13 @@ var (
searchBytes [][]byte
searchPath []string
ignoreList []string
binaryFile notPlainTextFlag
minifiedFile notPlainTextFlag
ignoreMap map[string]struct{}
ignoreCase bool
noColour bool
display *Display
matchedAny bool
printedFull bool
)
func init() {
@ -74,6 +69,8 @@ func init() {
rootCmd.Flags().StringSliceVarP(&ignoreList, "exclude", "x", []string{".git"}, "files/directories to exclude")
rootCmd.Flags().BoolVarP(&ignoreCase, "ignore-case", "i", false, "make all searches case insensitive")
rootCmd.Flags().BoolVarP(&noColour, "no-colour", "C", false, "disable colour output")
rootCmd.Flags().Var(&binaryFile, "binary", "what to do with binary files")
rootCmd.Flags().Var(&minifiedFile, "minified", "what to do with minified text files")
}
func run(c *cobra.Command, args []string) error {
@ -164,147 +161,12 @@ func search(path string) error {
return err
}
defer f.Close()
fullData, err := unix.Mmap(int(f.Fd()), 0, int(st.Size()), unix.PROT_READ, unix.MAP_PRIVATE)
data, err := unix.Mmap(int(f.Fd()), 0, int(st.Size()), unix.PROT_READ, unix.MAP_PRIVATE)
if err != nil {
return err
}
defer unix.Munmap(fullData)
var printedHeader bool
var (
data = fullData
lineNum int
b, b2 strings.Builder
)
for len(data) > 0 {
eol := bytes.IndexByte(data, '\n')
lineNum++
var line []byte
if eol == -1 {
line = data
data = nil
} else {
line = data[:eol]
data = data[eol+1:]
}
if len(line) == 0 {
continue
}
loc := matches(line)
if loc == nil {
continue
}
if !printedHeader {
printedHeader = true
if !matchedAny {
matchedAny = true
} else {
fmt.Println("")
}
fmt.Println(display.Filename(path))
}
b.Reset()
fmt.Fprintf(&b, "%4d: ", display.LineNumber(lineNum))
if loc[0] < 128 {
escape(&b, line[0:loc[0]])
} else {
start := loc[0] - 128
for i := 0; i < 5; i++ {
if utf8.RuneStart(line[start]) {
break
}
start++
}
b.WriteString(display.TruncatedBytes(start).String())
b.WriteString(display.TruncatedMarker().String())
escape(&b, line[start:loc[0]])
}
if loc[1]-loc[0] < 128 {
b2.Reset()
escape(&b2, line[loc[0]:loc[1]])
b.WriteString(display.Match(b2.String()).String())
if loc[1]+128 > len(line) {
escape(&b, line[loc[1]:])
} else {
end := loc[1] + 128
for i := 0; i < 5; i++ {
if utf8.RuneStart(line[end]) {
break
}
end--
}
escape(&b, line[loc[1]:end])
b.WriteString(display.TruncatedBytes(len(line) - end).String())
b.WriteString(display.TruncatedMarker().String())
}
} else {
end := loc[1]
for i := 0; i < 5; i++ {
if utf8.RuneStart(line[end]) {
break
}
end--
}
b2.Reset()
escape(&b2, line[loc[0]:end])
b.WriteString(display.Match(b2.String()).String())
b.WriteString(display.TruncatedMarker().String())
b.WriteString(display.TruncatedBytes(len(line) - end).String())
}
b.WriteRune('\n')
fmt.Print(b.String())
}
defer unix.Munmap(data)
file(path, data)
return nil
}
// matches reports the location of the first search term found in data.
//
// Each pattern in regexps is tried in order, followed by each literal in
// searchBytes; the first one that matches determines the result. The result
// holds the start and end byte offsets of the match within data, or is nil
// when nothing matches.
func matches(data []byte) (loc []int) {
for _, re := range regexps {
// NOTE: this loc shadows the named result of the same name.
loc := re.FindIndex(data)
if loc != nil {
return loc
}
}
for _, s := range searchBytes {
pos := bytes.Index(data, s)
if pos != -1 {
// Literal match: the end offset is derived from the literal's length.
return []int{pos, pos + len(s)}
}
}
return nil
}
// escape appends a displayable rendering of s to b, decoding s one UTF-8 rune
// at a time. Tabs and printable runes are written unchanged; everything else
// is replaced by a marker obtained from display.
func escape(b *strings.Builder, s []byte) {
for len(s) > 0 {
r, size := utf8.DecodeRune(s)
s = s[size:]
switch {
case r == utf8.RuneError && size == 1:
// A byte that could not be decoded as UTF-8.
b.WriteString(display.BadUTF8Char().String())
case r == '\r':
b.WriteString(display.CarriageReturn().String())
case r == '\t',
unicode.IsPrint(r):
b.WriteRune(r)
default:
// Any remaining rune that unicode.IsPrint rejects.
b.WriteString(display.UnprintableChar().String())
}
}
}