Improved binary / minified file detection heuristics

This commit is contained in:
Laurence Withers 2023-05-13 13:17:54 +01:00
parent 5790d3ab5f
commit 2f3af7fc8e
1 changed file with 60 additions and 38 deletions

98
file.go
View File

@ -12,6 +12,20 @@ import (
"unicode/utf8" "unicode/utf8"
) )
// Thresholds used when printing matches and when classifying a file as
// binary or minified.
const (
// if a matching line is longer than longLine runes then we will print
// a truncated form
longLine = 256
// if any line inside the examined window(s) reaches this many runes
// (the check in notPlainTextAux is lineLength >= minifiedLine) we will
// count the file as being minified
minifiedLine = 1024
// number of bytes at the start / end of each file to examine for binary
// or minified file detection. The end of the file is only examined when
// the file is at least 2*bytesToExamine bytes long and the start was not
// already classified as binary (see notPlainText).
bytesToExamine = 8192
)
type notPlainTextBehaviour int type notPlainTextBehaviour int
const ( const (
@ -56,8 +70,9 @@ func (nptf *notPlainTextFlag) Type() string {
func file(path string, data []byte) { func file(path string, data []byte) {
var short string var short string
isBinary, isMinified := notPlainText(data)
switch { switch {
case isBinary(data): case isBinary:
switch binaryFile.b { switch binaryFile.b {
case notPlainTextShort: case notPlainTextShort:
short = "Binary" short = "Binary"
@ -65,7 +80,7 @@ func file(path string, data []byte) {
return return
} }
case isMinified(data): case isMinified:
switch minifiedFile.b { switch minifiedFile.b {
case notPlainTextShort: case notPlainTextShort:
short = "Minified" short = "Minified"
@ -194,55 +209,62 @@ func file(path string, data []byte) {
} }
} }
// NOTE(review): this span is a flattened side-by-side diff. On each line the
// left-hand text belongs to the removed isBinary and the right-hand text to
// its replacement, notPlainText.
//
// notPlainText (added side) reports whether data looks binary and/or
// minified. It scans the start of data with notPlainTextAux and returns that
// result directly when the start is already binary or when data is shorter
// than 2*bytesToExamine. Otherwise it also scans the final bytesToExamine
// bytes and returns the logical OR of the two scans.
//
// isBinary (removed side) examined roughly the first 4096 bytes only and
// returned true on invalid UTF-8 or on a control character below ' ' other
// than \r, \n and \t.
func isBinary(data []byte) bool { func notPlainText(data []byte) (isBinary, isMinified bool) {
bytesToExamine := 4096 // examine bytes at the start of the file for binary data
b, m := notPlainTextAux(data)
for bytesToExamine > 0 { if b || len(data) < bytesToExamine*2 {
r, s := utf8.DecodeRune(data) return b, m
switch {
case s == 0:
// end of string
return false
case s == 1 && r == utf8.RuneError:
// invalid UTF-8
return true
case r == '\r', r == '\n', r == '\t':
// valid control chars
case r < ' ':
// invalid control chars
return true
}
data = data[s:]
bytesToExamine -= s
} }
return false // some files, like .a files, have a header which passes plaintext
// detection but we can expect the trailer to be binary
// (added side) keep only the last bytesToExamine bytes; the length guard
// above guarantees data is long enough for this slice expression.
data = data[len(data)-bytesToExamine:]
// (added side) the tail slice may begin in the middle of a multi-byte rune;
// drop up to 5 leading bytes until one that can start a rune is found, so
// the cut itself is not reported as invalid UTF-8.
for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
if utf8.RuneStart(data[0]) {
break
}
data = data[1:]
}
b2, m2 := notPlainTextAux(data)
return (b || b2), (m || m2)
} }
// NOTE(review): this span is a flattened side-by-side diff. On each line the
// left-hand text belongs to the removed longLine constant and isMinified
// function, and the right-hand text to the added notPlainTextAux.
//
// notPlainTextAux (added side) decodes runes from the start of data until
// about bytesToExamine bytes have been consumed or data is exhausted. It
// returns with isBinary set as soon as it meets an invalid UTF-8 byte or a
// control character below ' ' other than \n, \r, \t and \v. isMinified is set
// once the rune count of the current line reaches minifiedLine.
//
// isMinified (removed side) scanned roughly the first 4096 bytes and returned
// true as soon as a line reached longLine runes.
const longLine = 256 func notPlainTextAux(data []byte) (isBinary, isMinified bool) {
n := bytesToExamine
func isMinified(data []byte) bool {
bytesToExamine := 4096
var lineLength int var lineLength int
for n > 0 {
for bytesToExamine > 0 {
r, s := utf8.DecodeRune(data) r, s := utf8.DecodeRune(data)
switch { switch {
case s == 0: case s == 0:
// end of string // end of string
return false return
case s == 1 && r == utf8.RuneError:
// invalid UTF-8
isBinary = true
return
case r == '\n': case r == '\n':
lineLength = 0 // newline
default: lineLength = -1
// (added side) -1 rather than 0 because the unconditional lineLength++
// after the switch runs for the newline too, bringing the count to 0.
lineLength++
if lineLength >= longLine { case r == '\r', r == '\t', r == '\v':
return true // valid control chars often present in text
}
case r < ' ':
// control chars not expected in plain text
isBinary = true
return
} }
data = data[s:] data = data[s:]
bytesToExamine -= s n -= s
lineLength++
// (added side) no return here: the loop keeps going after a long line is
// seen, presumably so binary content later in the window is still caught.
if lineLength >= minifiedLine {
isMinified = true
}
} }
return false return
} }
func findMatches(data []byte) (loc []int) { func findMatches(data []byte) (loc []int) {