compressible.go 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. package compress
  2. import "math"
  3. // Estimate returns a normalized compressibility estimate of block b.
  4. // Values close to zero are likely uncompressible.
  5. // Values above 0.1 are likely to be compressible.
  6. // Values above 0.5 are very compressible.
  7. // Very small lengths will return 0.
  8. func Estimate(b []byte) float64 {
  9. if len(b) < 16 {
  10. return 0
  11. }
  12. // Correctly predicted order 1
  13. hits := 0
  14. lastMatch := false
  15. var o1 [256]byte
  16. var hist [256]int
  17. c1 := byte(0)
  18. for _, c := range b {
  19. if c == o1[c1] {
  20. // We only count a hit if there was two correct predictions in a row.
  21. if lastMatch {
  22. hits++
  23. }
  24. lastMatch = true
  25. } else {
  26. lastMatch = false
  27. }
  28. o1[c1] = c
  29. c1 = c
  30. hist[c]++
  31. }
  32. // Use x^0.6 to give better spread
  33. prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
  34. // Calculate histogram distribution
  35. variance := float64(0)
  36. avg := float64(len(b)) / 256
  37. for _, v := range hist {
  38. Δ := float64(v) - avg
  39. variance += Δ * Δ
  40. }
  41. stddev := math.Sqrt(float64(variance)) / float64(len(b))
  42. exp := math.Sqrt(1 / float64(len(b)))
  43. // Subtract expected stddev
  44. stddev -= exp
  45. if stddev < 0 {
  46. stddev = 0
  47. }
  48. stddev *= 1 + exp
  49. // Use x^0.4 to give better spread
  50. entropy := math.Pow(stddev, 0.4)
  51. // 50/50 weight between prediction and histogram distribution
  52. return math.Pow((prediction+entropy)/2, 0.9)
  53. }
  54. // ShannonEntropyBits returns the number of bits minimum required to represent
  55. // an entropy encoding of the input bytes.
  56. // https://en.wiktionary.org/wiki/Shannon_entropy
  57. func ShannonEntropyBits(b []byte) int {
  58. if len(b) == 0 {
  59. return 0
  60. }
  61. var hist [256]int
  62. for _, c := range b {
  63. hist[c]++
  64. }
  65. shannon := float64(0)
  66. invTotal := 1.0 / float64(len(b))
  67. for _, v := range hist[:] {
  68. if v > 0 {
  69. n := float64(v)
  70. shannon += math.Ceil(-math.Log2(n*invTotal) * n)
  71. }
  72. }
  73. return int(math.Ceil(shannon))
  74. }