Improved binary / minified file detection heuristics

2023-05-13 13:17:54 +01:00 · 2023-05-13 13:17:54 +01:00 · 2f3af7fc8e
parent 5790d3ab5f
commit 2f3af7fc8e
1 changed file with 60 additions and 38 deletions
--- a/file.go
+++ b/file.go
@@ -12,6 +12,20 @@ import (
 	"unicode/utf8"
 )

+const (
+	// if a matching line is longer than longLine runes then we will print
+	// a truncated form
+	longLine = 256
+
+	// if any examined line in the file is detected to be at least this many
+	// runes long we will count the file as being minified
+	minifiedLine = 1024
+
+	// number of bytes at the start / end of each file to examine for binary
+	// or minified file detection.
+	bytesToExamine = 8192
+)
+
 type notPlainTextBehaviour int

 const (
@@ -56,8 +70,9 @@ func (nptf *notPlainTextFlag) Type() string {

 func file(path string, data []byte) {
 	var short string
+	isBinary, isMinified := notPlainText(data)
 	switch {
-	case isBinary(data):
+	case isBinary:
 		switch binaryFile.b {
 		case notPlainTextShort:
 			short = "Binary"
@@ -65,7 +80,7 @@ func file(path string, data []byte) {
 			return
 		}

-	case isMinified(data):
+	case isMinified:
 		switch minifiedFile.b {
 		case notPlainTextShort:
 			short = "Minified"
@@ -194,55 +209,62 @@ func file(path string, data []byte) {
 	}
 }

-func isBinary(data []byte) bool {
-	bytesToExamine := 4096
-
-	for bytesToExamine > 0 {
-		r, s := utf8.DecodeRune(data)
-		switch {
-		case s == 0:
-			// end of string
-			return false
-		case s == 1 && r == utf8.RuneError:
-			// invalid UTF-8
-			return true
-		case r == '\r', r == '\n', r == '\t':
-			// valid control chars
-		case r < ' ':
-			// invalid control chars
-			return true
-		}
-		data = data[s:]
-		bytesToExamine -= s
+func notPlainText(data []byte) (isBinary, isMinified bool) {
+	// examine bytes at the start of the file for binary or minified data
+	b, m := notPlainTextAux(data)
+	if b || len(data) < bytesToExamine*2 {
+		return b, m
 	}

-	return false
+	// some files, like .a files, have a header which passes plaintext
+	// detection but we can expect the trailer to be binary
+	data = data[len(data)-bytesToExamine:]
+	for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
+		if utf8.RuneStart(data[0]) {
+			break
+		}
+		data = data[1:]
+	}
+	b2, m2 := notPlainTextAux(data)
+	return (b || b2), (m || m2)
 }

-const longLine = 256
-
-func isMinified(data []byte) bool {
-	bytesToExamine := 4096
+func notPlainTextAux(data []byte) (isBinary, isMinified bool) {
+	n := bytesToExamine
 	var lineLength int
-
-	for bytesToExamine > 0 {
+	for n > 0 {
 		r, s := utf8.DecodeRune(data)
 		switch {
 		case s == 0:
 			// end of string
-			return false
+			return
+
+		case s == 1 && r == utf8.RuneError:
+			// invalid UTF-8
+			isBinary = true
+			return
+
 		case r == '\n':
-			lineLength = 0
-		default:
-			lineLength++
-			if lineLength >= longLine {
-				return true
-			}
+			// newline
+			lineLength = -1
+
+		case r == '\r', r == '\t', r == '\v':
+			// valid control chars often present in text
+
+		case r < ' ':
+			// control chars not expected in plain text
+			isBinary = true
+			return
 		}
+
 		data = data[s:]
-		bytesToExamine -= s
+		n -= s
+		lineLength++
+		if lineLength >= minifiedLine {
+			isMinified = true
+		}
 	}
-	return false
+	return
 }

 func findMatches(data []byte) (loc []int) {