Improved binary / minified file detection heuristics
This commit is contained in:
		
							parent
							
								
									5790d3ab5f
								
							
						
					
					
						commit
						2f3af7fc8e
					
				
							
								
								
									
										98
									
								
								file.go
								
								
								
								
							
							
						
						
									
										98
									
								
								file.go
								
								
								
								
							|  | @ -12,6 +12,20 @@ import ( | |||
| 	"unicode/utf8" | ||||
| ) | ||||
| 
 | ||||
| const ( | ||||
| 	// if a matching line is longer than longLine runes then we will print
 | ||||
| 	// a truncated form
 | ||||
| 	longLine = 256 | ||||
| 
 | ||||
| 	// if any line in the examined bytes is detected to be at least this many runes
 | ||||
| 	// we will count the file as being minified
 | ||||
| 	minifiedLine = 1024 | ||||
| 
 | ||||
| 	// number of bytes at the start / end of each file to examine for binary
 | ||||
| 	// or minified file detection.
 | ||||
| 	bytesToExamine = 8192 | ||||
| ) | ||||
| 
 | ||||
| type notPlainTextBehaviour int | ||||
| 
 | ||||
| const ( | ||||
|  | @ -56,8 +70,9 @@ func (nptf *notPlainTextFlag) Type() string { | |||
| 
 | ||||
| func file(path string, data []byte) { | ||||
| 	var short string | ||||
| 	isBinary, isMinified := notPlainText(data) | ||||
| 	switch { | ||||
| 	case isBinary(data): | ||||
| 	case isBinary: | ||||
| 		switch binaryFile.b { | ||||
| 		case notPlainTextShort: | ||||
| 			short = "Binary" | ||||
|  | @ -65,7 +80,7 @@ func file(path string, data []byte) { | |||
| 			return | ||||
| 		} | ||||
| 
 | ||||
| 	case isMinified(data): | ||||
| 	case isMinified: | ||||
| 		switch minifiedFile.b { | ||||
| 		case notPlainTextShort: | ||||
| 			short = "Minified" | ||||
|  | @ -194,55 +209,62 @@ func file(path string, data []byte) { | |||
| 	} | ||||
| } | ||||
| 
 | ||||
| func isBinary(data []byte) bool { | ||||
| 	bytesToExamine := 4096 | ||||
| 
 | ||||
| 	for bytesToExamine > 0 { | ||||
| 		r, s := utf8.DecodeRune(data) | ||||
| 		switch { | ||||
| 		case s == 0: | ||||
| 			// end of string
 | ||||
| 			return false | ||||
| 		case s == 1 && r == utf8.RuneError: | ||||
| 			// invalid UTF-8
 | ||||
| 			return true | ||||
| 		case r == '\r', r == '\n', r == '\t': | ||||
| 			// valid control chars
 | ||||
| 		case r < ' ': | ||||
| 			// invalid control chars
 | ||||
| 			return true | ||||
| 		} | ||||
| 		data = data[s:] | ||||
| 		bytesToExamine -= s | ||||
| func notPlainText(data []byte) (isBinary, isMinified bool) { | ||||
| 	// examine bytes at the start of the file for binary or minified content
 | ||||
| 	b, m := notPlainTextAux(data) | ||||
| 	if b || len(data) < bytesToExamine*2 { | ||||
| 		return b, m | ||||
| 	} | ||||
| 
 | ||||
| 	return false | ||||
| 	// some files, like .a files, have a header which passes plaintext
 | ||||
| 	// detection but we can expect the trailer to be binary
 | ||||
| 	data = data[len(data)-bytesToExamine:] | ||||
| 	for i := 0; i < 5; i++ { // attempt to align to UTF-8 char boundary
 | ||||
| 		if utf8.RuneStart(data[0]) { | ||||
| 			break | ||||
| 		} | ||||
| 		data = data[1:] | ||||
| 	} | ||||
| 	b2, m2 := notPlainTextAux(data) | ||||
| 	return (b || b2), (m || m2) | ||||
| } | ||||
| 
 | ||||
| const longLine = 256 | ||||
| 
 | ||||
| func isMinified(data []byte) bool { | ||||
| 	bytesToExamine := 4096 | ||||
| func notPlainTextAux(data []byte) (isBinary, isMinified bool) { | ||||
| 	n := bytesToExamine | ||||
| 	var lineLength int | ||||
| 
 | ||||
| 	for bytesToExamine > 0 { | ||||
| 	for n > 0 { | ||||
| 		r, s := utf8.DecodeRune(data) | ||||
| 		switch { | ||||
| 		case s == 0: | ||||
| 			// end of string
 | ||||
| 			return false | ||||
| 			return | ||||
| 
 | ||||
| 		case s == 1 && r == utf8.RuneError: | ||||
| 			// invalid UTF-8
 | ||||
| 			isBinary = true | ||||
| 			return | ||||
| 
 | ||||
| 		case r == '\n': | ||||
| 			lineLength = 0 | ||||
| 		default: | ||||
| 			lineLength++ | ||||
| 			if lineLength >= longLine { | ||||
| 				return true | ||||
| 			} | ||||
| 			// newline
 | ||||
| 			lineLength = -1 | ||||
| 
 | ||||
| 		case r == '\r', r == '\t', r == '\v': | ||||
| 			// valid control chars often present in text
 | ||||
| 
 | ||||
| 		case r < ' ': | ||||
| 			// control chars not expected in plain text
 | ||||
| 			isBinary = true | ||||
| 			return | ||||
| 		} | ||||
| 
 | ||||
| 		data = data[s:] | ||||
| 		bytesToExamine -= s | ||||
| 		n -= s | ||||
| 		lineLength++ | ||||
| 		if lineLength >= minifiedLine { | ||||
| 			isMinified = true | ||||
| 		} | ||||
| 	} | ||||
| 	return false | ||||
| 	return | ||||
| } | ||||
| 
 | ||||
| func findMatches(data []byte) (loc []int) { | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue