From 2f3af7fc8e23e7f58b184f8ea39668c2f0cb5add Mon Sep 17 00:00:00 2001
From: Laurence Withers
Date: Sat, 13 May 2023 13:17:54 +0100
Subject: [PATCH] Improved binary / minified file detection heuristics

---
 file.go | 98 +++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 60 insertions(+), 38 deletions(-)

diff --git a/file.go b/file.go
index a3311d0..615ce10 100644
--- a/file.go
+++ b/file.go
@@ -12,6 +12,20 @@ import (
 	"unicode/utf8"
 )
 
+const (
+	// if a matching line is longer than longLine runes then we will print
+	// a truncated form
+	longLine = 256
+
+	// if any line in the file is detected to be longer than this many runes
+	// we will count the file as being minified
+	minifiedLine = 1024
+
+	// number of bytes at the start / end of each file to examine for binary
+	// or minified file detection.
+	bytesToExamine = 8192
+)
+
 type notPlainTextBehaviour int
 
 const (
@@ -56,8 +70,9 @@ func (nptf *notPlainTextFlag) Type() string {
 func file(path string, data []byte) {
 	var short string
 
+	isBinary, isMinified := notPlainText(data)
 	switch {
-	case isBinary(data):
+	case isBinary:
 		switch binaryFile.b {
 		case notPlainTextShort:
 			short = "Binary"
@@ -65,7 +80,7 @@ func file(path string, data []byte) {
 			return
 		}
 
-	case isMinified(data):
+	case isMinified:
 		switch minifiedFile.b {
 		case notPlainTextShort:
 			short = "Minified"
@@ -194,55 +209,62 @@ func file(path string, data []byte) {
 	}
 }
 
-func isBinary(data []byte) bool {
-	bytesToExamine := 4096
-
-	for bytesToExamine > 0 {
-		r, s := utf8.DecodeRune(data)
-		switch {
-		case s == 0:
-			// end of string
-			return false
-		case s == 1 && r == utf8.RuneError:
-			// invalid UTF-8
-			return true
-		case r == '\r', r == '\n', r == '\t':
-			// valid control chars
-		case r < ' ':
-			// invalid control chars
-			return true
-		}
-		data = data[s:]
-		bytesToExamine -= s
+func notPlainText(data []byte) (isBinary, isMinified bool) {
+	// examine bytes at the start of the file for binary data
+	b, m := notPlainTextAux(data)
+	if b || len(data) < bytesToExamine*2 {
+		return b, m
 	}
 
-	return false
+	// some files, like .a files, have a header which passes plaintext
+	// detection but we can expect the trailer to be binary
+	data = data[len(data)-bytesToExamine:]
+	for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
+		if utf8.RuneStart(data[0]) {
+			break
+		}
+		data = data[1:]
+	}
+	b2, m2 := notPlainTextAux(data)
+	return (b || b2), (m || m2)
 }
 
-const longLine = 256
-
-func isMinified(data []byte) bool {
-	bytesToExamine := 4096
+func notPlainTextAux(data []byte) (isBinary, isMinified bool) {
+	n := bytesToExamine
 	var lineLength int
-
-	for bytesToExamine > 0 {
+	for n > 0 {
 		r, s := utf8.DecodeRune(data)
 		switch {
 		case s == 0:
 			// end of string
-			return false
+			return
+
+		case s == 1 && r == utf8.RuneError:
+			// invalid UTF-8
+			isBinary = true
+			return
+
 		case r == '\n':
-			lineLength = 0
-		default:
-			lineLength++
-			if lineLength >= longLine {
-				return true
-			}
+			// newline
+			lineLength = -1
+
+		case r == '\r', r == '\t', r == '\v':
+			// valid control chars often present in text
+
+		case r < ' ':
+			// control chars not expected in plain text
+			isBinary = true
+			return
 		}
+
 		data = data[s:]
-		bytesToExamine -= s
+		n -= s
+		lineLength++
+		if lineLength >= minifiedLine {
+			isMinified = true
+		}
 	}
-	return false
+	return
 }
 
 func findMatches(data []byte) (loc []int) {